Skip to content

Commit b452e89

Browse files
Hyeri1ee and ericbottard
authored and committed
GH-4428: improve TextSplitter with property preservation and tracking
Signed-off-by: Hyeri1ee <haerizian10@gmail.com>

Fix test comments and update TokenTextSplitterTest for enhanced metadata
- Correct comment: 'excluding' -> 'including' chunk-specific fields
- Update TokenTextSplitterTest to handle new metadata fields
- Ensure all tests pass with enhanced TextSplitter functionality

Signed-off-by: Eric Bottard <eric.bottard@broadcom.com>
1 parent 6854e3e commit b452e89

File tree

4 files changed

+185
-19
lines changed

4 files changed

+185
-19
lines changed

spring-ai-commons/src/main/java/org/springframework/ai/document/id/IdGenerator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public interface IdGenerator {
2626

2727
/**
2828
* Generate a unique ID for the given content. Note: some generator, such as the
29-
* random generator might not dependent on or use the content parameters.
29+
* random generator might not depend on or use the content parameters.
3030
* @param contents the content to generate an ID for.
3131
* @return the generated ID.
3232
*/

spring-ai-commons/src/main/java/org/springframework/ai/transformer/splitter/TextSplitter.java

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -63,44 +63,61 @@ private List<Document> doSplitDocuments(List<Document> documents) {
6363
List<String> texts = new ArrayList<>();
6464
List<Map<String, Object>> metadataList = new ArrayList<>();
6565
List<ContentFormatter> formatters = new ArrayList<>();
66+
List<Double> scores = new ArrayList<>();
67+
List<String> originalIds = new ArrayList<>();
6668

6769
for (Document doc : documents) {
6870
texts.add(doc.getText());
6971
metadataList.add(doc.getMetadata());
7072
formatters.add(doc.getContentFormatter());
73+
scores.add(doc.getScore());
74+
originalIds.add(doc.getId());
7175
}
7276

73-
return createDocuments(texts, formatters, metadataList);
77+
return createDocuments(texts, formatters, metadataList, scores, originalIds);
7478
}
7579

7680
private List<Document> createDocuments(List<String> texts, List<ContentFormatter> formatters,
77-
List<Map<String, Object>> metadataList) {
81+
List<Map<String, Object>> metadataList, List<Double> scores, List<String> originalIds) {
7882

7983
// Process the data in a column oriented way and recreate the Document
8084
List<Document> documents = new ArrayList<>();
8185

8286
for (int i = 0; i < texts.size(); i++) {
8387
String text = texts.get(i);
8488
Map<String, Object> metadata = metadataList.get(i);
89+
Double originalScore = scores.get(i);
90+
String originalId = originalIds.get(i);
91+
8592
List<String> chunks = splitText(text);
8693
if (chunks.size() > 1) {
8794
logger.info("Splitting up document into {} chunks.", chunks.size());
8895
}
89-
for (String chunk : chunks) {
90-
// only primitive values are in here -
91-
Map<String, Object> metadataCopy = metadata.entrySet()
96+
97+
for (int chunkIndex = 0; chunkIndex < chunks.size(); chunkIndex++) {
98+
String chunk = chunks.get(chunkIndex);
99+
100+
Map<String, Object> enhancedMetadata = metadata.entrySet()
92101
.stream()
93102
.filter(e -> e.getKey() != null && e.getValue() != null)
94103
.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
95-
Document newDoc = new Document(chunk, metadataCopy);
104+
105+
enhancedMetadata.put("parent_document_id", originalId);
106+
enhancedMetadata.put("chunk_index", chunkIndex);
107+
enhancedMetadata.put("total_chunks", chunks.size());
108+
109+
Document newDoc = Document.builder()
110+
.text(chunk)
111+
.metadata(enhancedMetadata)
112+
.score(originalScore)
113+
.build();
96114

97115
if (this.copyContentFormatter) {
98116
// Transfer the content-formatter of the parent to the chunked
99-
// documents it was slit into.
117+
// documents it was split into.
100118
newDoc.setContentFormatter(formatters.get(i));
101119
}
102120

103-
// TODO copy over other properties.
104121
documents.add(newDoc);
105122
}
106123
}

spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java

Lines changed: 143 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,22 @@ public void testSplitText() {
8282
assertThat(chunks.get(3).getText())
8383
.isEqualTo("choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.");
8484

85-
// Verify that the same, merged metadata is copied to all chunks.
86-
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
87-
assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
85+
// Verify that the original metadata is copied to all chunks (including
86+
// chunk-specific fields)
87+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
88+
"total_chunks");
89+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
90+
"total_chunks");
91+
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
92+
"total_chunks");
93+
assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
94+
"total_chunks");
95+
96+
// Verify chunk indices are correct
97+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
98+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
99+
assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
100+
assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
88101
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
89102
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");
90103

@@ -148,7 +161,6 @@ public void pageNoChunkSplit() {
148161
@Test
149162
public void pageWithChunkSplit() {
150163
// given
151-
152164
var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
153165
+ "1The most oppressive thing about the labyrinth is that you are constantly "
154166
+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
@@ -236,13 +248,137 @@ public void testSplitTextWithNullMetadata() {
236248
assertThat(chunks.get(0).getText()).isEqualTo("In the end, writing arises when man");
237249
assertThat(chunks.get(1).getText()).isEqualTo(" realizes that memory is not enough.");
238250

239-
// Verify that the same, merged metadata is copied to all chunks.
240-
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
241-
assertThat(chunks.get(1).getMetadata()).containsKeys("key1");
251+
// Verify that the original metadata is copied to all chunks (with chunk-specific
252+
// fields)
253+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
254+
"total_chunks");
255+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "parent_document_id", "chunk_index",
256+
"total_chunks");
257+
258+
// Verify chunk indices are different
259+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
260+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
242261

243262
// Verify that the content formatters are copied from the parents to the chunks.
244263
assertThat(chunks.get(0).getContentFormatter()).isSameAs(contentFormatter);
245264
assertThat(chunks.get(1).getContentFormatter()).isSameAs(contentFormatter);
246265
}
247266

267+
@Test
268+
public void testScorePreservation() {
269+
// given
270+
Double originalScore = 0.95;
271+
var doc = Document.builder()
272+
.text("This is a test document that will be split into multiple chunks.")
273+
.metadata(Map.of("source", "test.txt"))
274+
.score(originalScore)
275+
.build();
276+
277+
// when
278+
List<Document> chunks = testTextSplitter.apply(List.of(doc));
279+
280+
// then
281+
assertThat(chunks).hasSize(2);
282+
assertThat(chunks.get(0).getScore()).isEqualTo(originalScore);
283+
assertThat(chunks.get(1).getScore()).isEqualTo(originalScore);
284+
}
285+
286+
@Test
287+
public void testParentDocumentTracking() {
288+
// given
289+
var doc1 = new Document("First document content for testing splitting functionality.",
290+
Map.of("source", "doc1.txt"));
291+
var doc2 = new Document("Second document content for testing splitting functionality.",
292+
Map.of("source", "doc2.txt"));
293+
294+
String originalId1 = doc1.getId();
295+
String originalId2 = doc2.getId();
296+
297+
// when
298+
List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
299+
300+
// then
301+
assertThat(chunks).hasSize(4);
302+
303+
// Verify parent document tracking for doc1 chunks
304+
assertThat(chunks.get(0).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
305+
assertThat(chunks.get(1).getMetadata().get("parent_document_id")).isEqualTo(originalId1);
306+
307+
// Verify parent document tracking for doc2 chunks
308+
assertThat(chunks.get(2).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
309+
assertThat(chunks.get(3).getMetadata().get("parent_document_id")).isEqualTo(originalId2);
310+
}
311+
312+
@Test
313+
public void testChunkMetadataInformation() {
314+
// given
315+
var doc = new Document("This is a longer document that will be split into exactly two chunks for testing.",
316+
Map.of("source", "test.txt"));
317+
318+
// when
319+
List<Document> chunks = testTextSplitter.apply(List.of(doc));
320+
321+
// then
322+
assertThat(chunks).hasSize(2);
323+
324+
// Verify chunk index and total chunks for first chunk
325+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
326+
assertThat(chunks.get(0).getMetadata().get("total_chunks")).isEqualTo(2);
327+
328+
// Verify chunk index and total chunks for second chunk
329+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
330+
assertThat(chunks.get(1).getMetadata().get("total_chunks")).isEqualTo(2);
331+
332+
// Verify original metadata is preserved
333+
assertThat(chunks.get(0).getMetadata().get("source")).isEqualTo("test.txt");
334+
assertThat(chunks.get(1).getMetadata().get("source")).isEqualTo("test.txt");
335+
}
336+
337+
@Test
338+
public void testEnhancedMetadataWithMultipleDocuments() {
339+
// given
340+
var doc1 = Document.builder()
341+
.text("First document with score and metadata.")
342+
.metadata(Map.of("type", "article", "priority", "high"))
343+
.score(0.8)
344+
.build();
345+
346+
var doc2 = Document.builder()
347+
.text("Second document with different score.")
348+
.metadata(Map.of("type", "report", "priority", "medium"))
349+
.score(0.6)
350+
.build();
351+
352+
String originalId1 = doc1.getId();
353+
String originalId2 = doc2.getId();
354+
355+
// when
356+
List<Document> chunks = testTextSplitter.apply(List.of(doc1, doc2));
357+
358+
// then
359+
assertThat(chunks).hasSize(4);
360+
361+
// Verify first document chunks
362+
for (int i = 0; i < 2; i++) {
363+
Document chunk = chunks.get(i);
364+
assertThat(chunk.getScore()).isEqualTo(0.8);
365+
assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId1);
366+
assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i);
367+
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
368+
assertThat(chunk.getMetadata().get("type")).isEqualTo("article");
369+
assertThat(chunk.getMetadata().get("priority")).isEqualTo("high");
370+
}
371+
372+
// Verify second document chunks
373+
for (int i = 2; i < 4; i++) {
374+
Document chunk = chunks.get(i);
375+
assertThat(chunk.getScore()).isEqualTo(0.6);
376+
assertThat(chunk.getMetadata().get("parent_document_id")).isEqualTo(originalId2);
377+
assertThat(chunk.getMetadata().get("chunk_index")).isEqualTo(i - 2);
378+
assertThat(chunk.getMetadata().get("total_chunks")).isEqualTo(2);
379+
assertThat(chunk.getMetadata().get("type")).isEqualTo("report");
380+
assertThat(chunk.getMetadata().get("priority")).isEqualTo("medium");
381+
}
382+
}
383+
248384
}

spring-ai-commons/src/test/java/org/springframework/ai/transformer/splitter/TokenTextSplitterTest.java

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -104,9 +104,22 @@ public void testTokenTextSplitterBuilderWithAllFields() {
104104
assertThat(chunks.get(4).getText()).isEqualTo("It isn’t the lack of an exit, but");
105105
assertThat(chunks.get(5).getText()).isEqualTo("the abundance of exits that is so disorienting");
106106

107-
// Verify that the same, merged metadata is copied to all chunks.
108-
assertThat(chunks.get(0).getMetadata()).isEqualTo(chunks.get(1).getMetadata());
109-
assertThat(chunks.get(2).getMetadata()).isEqualTo(chunks.get(3).getMetadata());
107+
// Verify that the original metadata is copied to all chunks (including
108+
// chunk-specific fields)
109+
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
110+
"total_chunks");
111+
assertThat(chunks.get(1).getMetadata()).containsKeys("key1", "key2", "parent_document_id", "chunk_index",
112+
"total_chunks");
113+
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
114+
"total_chunks");
115+
assertThat(chunks.get(3).getMetadata()).containsKeys("key2", "key3", "parent_document_id", "chunk_index",
116+
"total_chunks");
117+
118+
// Verify chunk indices are correct
119+
assertThat(chunks.get(0).getMetadata().get("chunk_index")).isEqualTo(0);
120+
assertThat(chunks.get(1).getMetadata().get("chunk_index")).isEqualTo(1);
121+
assertThat(chunks.get(2).getMetadata().get("chunk_index")).isEqualTo(0);
122+
assertThat(chunks.get(3).getMetadata().get("chunk_index")).isEqualTo(1);
110123

111124
assertThat(chunks.get(0).getMetadata()).containsKeys("key1", "key2").doesNotContainKeys("key3");
112125
assertThat(chunks.get(2).getMetadata()).containsKeys("key2", "key3").doesNotContainKeys("key1");

0 commit comments

Comments (0)