From 3bec210b802e7238dd096017e72d09db7a628864 Mon Sep 17 00:00:00 2001
From: jiwoo
Date: Sun, 23 Jun 2024 22:10:56 +0900
Subject: [PATCH] Add additional tests for Document metadata and TextSplitter

---
 .../splitter/TextSplitterTests.java           | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java b/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
index 6c149dd04a0..ed6044da5e2 100644
--- a/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
+++ b/spring-ai-core/src/test/java/org/springframework/ai/transformer/splitter/TextSplitterTests.java
@@ -25,9 +25,11 @@
 import org.springframework.ai.document.Document;
 
 import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.jupiter.api.Assertions.assertAll;
 
 /**
  * @author Christian Tzolov
+ * @author Jiwoo Kim
  */
 public class TextSplitterTests {
 
@@ -105,4 +107,110 @@ public void testSplitText() {
 
 	}
 
+	@Test
+	public void pageNoChunkSplit() {
+		// given: four documents from sample1.pdf, one per page_number 1 to 4
+		var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
+				+ "1The most oppressive thing about the labyrinth is that you are constantly "
+				+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				Map.of("file_name", "sample1.pdf", "page_number", 1));
+
+		var doc2 = new Document("2In the end, writing arises when man realizes that memory is not enough."
+				+ "2The most oppressive thing about the labyrinth is that you are constantly "
+				+ "2being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				Map.of("file_name", "sample1.pdf", "page_number", 2));
+
+		var doc3 = new Document("3In the end, writing arises when man realizes that memory is not enough."
+				+ "3The most oppressive thing about the labyrinth is that you are constantly "
+				+ "3being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				Map.of("file_name", "sample1.pdf", "page_number", 3));
+
+		var doc4 = new Document("4In the end, writing arises when man realizes that memory is not enough."
+				+ "4The most oppressive thing about the labyrinth is that you are constantly "
+				+ "4being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				Map.of("file_name", "sample1.pdf", "page_number", 4));
+
+		var tokenTextSplitter = new TokenTextSplitter();
+
+		// when: all four pages go through a TokenTextSplitter built with its no-arg constructor
+		List<Document> splitDocuments = tokenTextSplitter.apply(List.of(doc1, doc2, doc3, doc4));
+
+		// then: exactly one chunk per input page, each still carrying its own page_number
+		assertAll(() -> assertThat(splitDocuments).isNotNull(), () -> assertThat(splitDocuments).isNotEmpty(),
+				() -> assertThat(splitDocuments).hasSize(4),
+				() -> assertThat(splitDocuments.get(0).getMetadata().get("page_number")).isEqualTo(1),
+				() -> assertThat(splitDocuments.get(1).getMetadata().get("page_number")).isEqualTo(2),
+				() -> assertThat(splitDocuments.get(2).getMetadata().get("page_number")).isEqualTo(3),
+				() -> assertThat(splitDocuments.get(3).getMetadata().get("page_number")).isEqualTo(4));
+	}
+
+	@Test
+	public void pageWithChunkSplit() {
+		// given: short pages 1 and 3 around a long page 2 that is expected to yield two chunks
+
+		var doc1 = new Document("1In the end, writing arises when man realizes that memory is not enough."
+				+ "1The most oppressive thing about the labyrinth is that you are constantly "
+				+ "1being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				Map.of("file_name", "sample1.pdf", "page_number", 1));
+
+		var doc2 = new Document(
+				"levels, their care providers, legal representatives and families get the right home and \n"
+						+ " community-based support and services at the right time, in the right place. Please click here to \n"
+						+ " go to Community Living Connections. \n"
+						+ "\n"
+						+ " I am trying to register as a consumer, but Carina will not recognize me or my \n"
+						+ " information. What should I do? \n"
+						+ "\n"
+						+ " Please double check your form entries including the spelling of your name and your \n"
+						+ " ProviderOne number, or last four digits of your social security number and date of birth. Please \n"
+						+ " use the name you have on file with the Department of Social and Health Services (DSHS). Also \n"
+						+ " make sure you have a current or pending assessment with DSHS. \n"
+						+ "\n"
+						+ " If you are having trouble registering, please contact us or call us at 1-855-796-0605. \n"
+						+ "\n"
+						+ " The Home Care Referral Registry has been absorbed by Consumer Direct Care \n"
+						+ " Network Washington (CDWA). Who can help me find care on Carina? \n"
+						+ "\n"
+						+ " Consumer Direct Care Network Washington (CDWA) has taken over from the Home Care \n"
+						+ " Referral Registry (HCRR). CDWA is responsible for assisting consumers and Individual Providers \n"
+						+ " (IPs) to use Carina to find matches. CDWA staff are available across the state to assist \n"
+						+ " consumers to sign up in the Carina system and help IPs get (re)contracted or hired to work. \n"
+						+ "\n"
+						+ " What are some good interview questions I should ask providers? \n"
+						+ "\n"
+						+ " Your approach to the interview is important, you are offering a job to someone who is looking \n"
+						+ " for work. The person you interview may be nervous. Put them at ease, call them by their first \n"
+						+ " name, maintain eye contact and tell them a little about yourself. Read more tips and specific \n"
+						+ " interview questions in our blog: What to Ask Potential Providers. \n"
+						+ "\n"
+						+ " I am ready to hire a home care provider! \n"
+						+ "\n"
+						+ " You found an Individual Provider (IP) that you would like to hire? That is exciting! In order for \n"
+						+ " them to start working, contact Consumer Direct Care Network Washington (CDWA) and request \n"
+						+ " authorization. They cannot start work before you have received an Okay to Work from CDWA. \n"
+						+ "\n"
+						+ " Consumers should continue to work with their case manager, who will help you create a Plan of \n"
+						+ " Care and access needed services.\n"
+						+ "Once you have decided on an IP to work with, they should\n" + "\n",
+				Map.of("file_name", "sample1.pdf", "page_number", 2));
+
+		var doc3 = new Document("3In the end, writing arises when man realizes that memory is not enough."
+				+ "3The most oppressive thing about the labyrinth is that you are constantly "
+				+ "3being forced to choose. It isn’t the lack of an exit, but the abundance of exits that is so disorienting.",
+				Map.of("file_name", "sample1.pdf", "page_number", 3));
+
+		var tokenTextSplitter = new TokenTextSplitter();
+
+		// when: the three pages go through a TokenTextSplitter built with its no-arg constructor
+		List<Document> splitDocuments = tokenTextSplitter.apply(List.of(doc1, doc2, doc3));
+
+		// then: four chunks in input order; the two middle chunks both report page_number 2
+		assertAll(() -> assertThat(splitDocuments).isNotNull(), () -> assertThat(splitDocuments).isNotEmpty(),
+				() -> assertThat(splitDocuments).hasSize(4),
+				() -> assertThat(splitDocuments.get(0).getMetadata().get("page_number")).isEqualTo(1),
+				() -> assertThat(splitDocuments.get(1).getMetadata().get("page_number")).isEqualTo(2),
+				() -> assertThat(splitDocuments.get(2).getMetadata().get("page_number")).isEqualTo(2),
+				() -> assertThat(splitDocuments.get(3).getMetadata().get("page_number")).isEqualTo(3));
+	}
+
 }