Skip to content

Commit b452e89

Browse files
Hyeri1ee and ericbottard
authored and committed
GH-4428: improve TextSplitter with property preservation and tracking
Signed-off-by: Hyeri1ee <haerizian10@gmail.com>

Fix test comments and update TokenTextSplitterTest for enhanced metadata
- Correct comment: 'excluding' -> 'including' chunk-specific fields
- Update TokenTextSplitterTest to handle new metadata fields
- Ensure all tests pass with enhanced TextSplitter functionality

Signed-off-by: Eric Bottard <eric.bottard@broadcom.com>
1 parent 6854e3e commit b452e89

File tree

4 files changed

+185
-19
lines changed

4 files changed

+185
-19
lines changed

spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public interface IdGenerator {
2626

2727
/**
2828
* Generate a unique ID for the given content. Note: some generator, such as the
29-
* random generator might not dependent on or use the content parameters.
29+
* random generator might not depend on or use the content parameters.
3030
* @param contents the content to generate an ID for.
3131
* @return the generated ID.
3232
*/

spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,44 +63,61 @@ private List<Document> doSplitDocuments(List<Document> documents) {
6363
List<String> texts = new ArrayList<>();
6464
List<Map<String, Object>> metadataList = new ArrayList<>();
6565
List<ContentFormatter> formatters = new ArrayList<>();
66+
List<Double> scores = new ArrayList<>();
67+
List<String> originalIds = new ArrayList<>();
6668

6769
for (Document doc : documents) {
6870
texts.add(doc.getText());
6971
metadataList.add(doc.getMetadata());
7072
formatters.add(doc.getContentFormatter());
73+
scores.add(doc.getScore());
74+
originalIds.add(doc.getId());
7175
}
7276

73-
return createDocuments(texts, formatters, metadataList);
77+
return createDocuments(texts, formatters, metadataList, scores, originalIds);
7478
}
7579

7680
private List<Document> createDocuments(List<String> texts, List<ContentFormatter> formatters,
77-
List<Map<String, Object>> metadataList) {
81+
List<Map<String, Object>> metadataList, List<Double> scores, List<String> originalIds) {
7882

7983
// Process the data in a column oriented way and recreate the Document
8084
List<Document> documents = new ArrayList<>();
8185

8286
for (int i = 0; i < texts.size(); i++) {
8387
String text = texts.get(i);
8488
Map<String, Object> metadata = metadataList.get(i);
89+
Double originalScore = scores.get(i);
90+
String originalId = originalIds.get(i);
91+
8592
List<String> chunks = splitText(text);
8693
if (chunks.size() > 1) {
8794
logger.info("Splitting up document into {} chunks.", chunks.size());
8895
}
89-
for (String chunk : chunks) {
90-
// only primitive values are in here -
91-
Map<String, Object> metadataCopy = metadata.entrySet()
96+
97+
for (int chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) {
98+
String chunk = chunks.get(chunkIndex);
99+
100+
Map<String, Object> enhancedMetadata = metadata.entrySet()
92101
.stream()
93102
.filter(e -> e.getKey() != null && e.getValue() != null)
94103
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
95-
Document newDoc = new Document(chunk, metadataCopy);
104+
105+
enhancedMetadata.put("parent_document_id", originalId);
106+
enhancedMetadata.put("chunk_index", chunkIndex);
107+
enhancedMetadata.put("total_chunks", chunks.size());
108+
109+
Document newDoc = Document.builder()
110+
.text(chunk)
111+
.metadata(enhancedMetadata)
112+
.score(originalScore)
113+
.build();
96114

97115
if (this.copyContentFormatter) {
98116
// Transfer the content-formatter of the parent to the chunked
99-
// documents it was slit into.
117+
// documents it was split into.
100118
newDoc.setContentFormatter(formatters.get(i));
101119
}
102120

103-
// TODO copy over other properties.
104121
documents.add(newDoc);
105122
}
106123
}

spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,22 @@ public void testSplitText() {
8282
assertThat(chunks.get(3).getText())
8383
.isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");
8484

85-
// Verify that the same, merged metadata is copied to all chunks.
86-
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
87-
assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
85+
// Verify that the original metadata is copied to all chunks (including
86+
// chunk-specific fields)
87+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
88+
"total_chunks");
89+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
90+
"total_chunks");
91+
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
92+
"total_chunks");
93+
assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
94+
"total_chunks");
95+
96+
// Verify chunk indices are correct
97+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
98+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
99+
assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
100+
assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
88101
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
89102
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
90103

@@ -148,7 +161,6 @@ public void pageNoChunkSplit() {
148161
@Test
149162
public void pageWithChunkSplit() {
150163
// given
151-
152164
var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
153165
+ "1The most oppressive thing about the labyrinth is that you are constantly "
154166
+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
@@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() {
236248
assertThat(chunks.get(0).getText()).isEqualTo("In the end, writing arises when man");
237249
assertThat(chunks.get(1).getText()).isEqualTo(" realizes that memory is not enough.");
238250

239-
// Verify that the same, merged metadata is copied to all chunks.
240-
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
241-
assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
251+
// Verify that the original metadata is copied to all chunks (with chunk-specific
252+
// fields)
253+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
254+
"total_chunks");
255+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
256+
"total_chunks");
257+
258+
// Verify chunk indices are different
259+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
260+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
242261

243262
// Verify that the content formatters are copied from the parents to the chunks.
244263
assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
245264
assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
246265
}
247266

267+
@Test
268+
public void testScorePreservation() {
269+
// given
270+
Double originalScore = 0.95;
271+
var doc = Document.builder()
272+
.text("This is a test document that will be split into multiple chunks.")
273+
.metadata(Map.of("source", "test.txt"))
274+
.score(originalScore)
275+
.build();
276+
277+
// when
278+
List<Document> chunks = testTextSplitter.apply(List.of(doc));
279+
280+
// then
281+
assertThat(chunks).hasSize(2);
282+
assertThat(chunks.get(0).getScore()).isEqualTo(originalScore);
283+
assertThat(chunks.get(1).getScore()).isEqualTo(originalScore);
284+
}
285+
286+
@Test
287+
public void testParentDocumentTracking() {
288+
// given
289+
var doc1 = new Document("First document content for testing splitting functionality.",
290+
Map.of("source", "doc1.txt"));
291+
var doc2 = new Document("Second document content for testing splitting functionality.",
292+
Map.of("source", "doc2.txt"));
293+
294+
String originalId1 = doc1.getId();
295+
String originalId2 = doc2.getId();
296+
297+
// when
298+
List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
299+
300+
// then
301+
assertThat(chunks).hasSize(4);
302+
303+
// Verify parent document tracking for doc1 chunks
304+
assertThat(chunks.get(0).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
305+
assertThat(chunks.get(1).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
306+
307+
// Verify parent document tracking for doc2 chunks
308+
assertThat(chunks.get(2).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
309+
assertThat(chunks.get(3).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
310+
}
311+
312+
@Test
313+
public void testChunkMetadataInformation() {
314+
// given
315+
var doc = new Document("This is a longer document that will be split into exactly two chunks for testing.",
316+
Map.of("source", "test.txt"));
317+
318+
// when
319+
List<Document> chunks = testTextSplitter.apply(List.of(doc));
320+
321+
// then
322+
assertThat(chunks).hasSize(2);
323+
324+
// Verify chunk index and total chunks for first chunk
325+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
326+
assertThat(chunks.get(0).getMetadata().get("total_chunks")).isEqualTo(2);
327+
328+
// Verify chunk index and total chunks for second chunk
329+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
330+
assertThat(chunks.get(1).getMetadata().get("total_chunks")).isEqualTo(2);
331+
332+
// Verify original metadata is preserved
333+
assertThat(chunks.get(0).getMetadata().get("source")).isEqualTo("test.txt");
334+
assertThat(chunks.get(1).getMetadata().get("source")).isEqualTo("test.txt");
335+
}
336+
337+
@Test
338+
public void testEnhancedMetadataWithMultipleDocuments() {
339+
// given
340+
var doc1 = Document.builder()
341+
.text("First document with score and metadata.")
342+
.metadata(Map.of("type", "article", "priority", "high"))
343+
.score(0.8)
344+
.build();
345+
346+
var doc2 = Document.builder()
347+
.text("Second document with different score.")
348+
.metadata(Map.of("type", "report", "priority", "medium"))
349+
.score(0.6)
350+
.build();
351+
352+
String originalId1 = doc1.getId();
353+
String originalId2 = doc2.getId();
354+
355+
// when
356+
List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
357+
358+
// then
359+
assertThat(chunks).hasSize(4);
360+
361+
// Verify first document chunks
362+
for (int i = 0; i < 2; i++) {
363+
Document chunk = chunks.get(i);
364+
assertThat(chunk.getScore()).isEqualTo(0.8);
365+
assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId1);
366+
assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i);
367+
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
368+
assertThat(chunk.getMetadata().get("type")).isEqualTo("article");
369+
assertThat(chunk.getMetadata().get("priority")).isEqualTo("high");
370+
}
371+
372+
// Verify second document chunks
373+
for (int i = 2; i < 4; i++) {
374+
Document chunk = chunks.get(i);
375+
assertThat(chunk.getScore()).isEqualTo(0.6);
376+
assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId2);
377+
assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i - 2);
378+
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
379+
assertThat(chunk.getMetadata().get("type")).isEqualTo("report");
380+
assertThat(chunk.getMetadata().get("priority")).isEqualTo("medium");
381+
}
382+
}
383+
248384
}

spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,22 @@ public void testTokenTextSplitterBuilderWithAllFields() {
104104
assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but");
105105
assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");
106106

107-
// Verify that the same, merged metadata is copied to all chunks.
108-
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
109-
assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
107+
// Verify that the original metadata is copied to all chunks (including
108+
// chunk-specific fields)
109+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
110+
"total_chunks");
111+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
112+
"total_chunks");
113+
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
114+
"total_chunks");
115+
assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
116+
"total_chunks");
117+
118+
// Verify chunk indices are correct
119+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
120+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
121+
assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
122+
assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
110123

111124
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
112125
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");

0 commit comments

Comments (0)