Skip to content

Commit aeb9f8a

Browse files
jiafu1115 authored and ericbottard committed
Fix PDF grouping logic to respect pagesPerDocument
Signed-off-by: stroller <fujian1115@gmail.com>
Auto-cherry-pick to 1.0.x
Signed-off-by: Eric Bottard <eric.bottard@broadcom.com>
1 parent 1780e88 commit aeb9f8a

File tree

2 files changed

+87
-49
lines changed

2 files changed

+87
-49
lines changed

document-readers/pdf-reader/src/main/java/org/springframework/ai/reader/pdf/PagePdfDocumentReader.java

Lines changed: 47 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@
2525
import org.apache.pdfbox.pdfparser.PDFParser;
2626
import org.apache.pdfbox.pdmodel.PDDocument;
2727
import org.apache.pdfbox.pdmodel.PDPage;
28+
import org.apache.pdfbox.pdmodel.PDPageTree;
2829
import org.slf4j.Logger;
2930
import org.slf4j.LoggerFactory;
3031

@@ -44,6 +45,7 @@
4445
* pageBottomMargin = 0
4546
*
4647
* @author Christian Tzolov
48+
* @author Fu Jian
4749
*/
4850
public class PagePdfDocumentReader implements DocumentReader {
4951

@@ -96,74 +98,70 @@ public List<Document> get() {
9698
try {
9799
var pdfTextStripper = new PDFLayoutTextStripperByArea();
98100

99-
int pageNumber = 0;
100-
int pagesPerDocument = 0;
101-
int startPageNumber = pageNumber;
101+
int pageNumber = 1;
102+
int startPageNumber = 1;
102103

103104
List<String> pageTextGroupList = new ArrayList<>();
104105

105-
int totalPages = this.document.getDocumentCatalog().getPages().getCount();
106-
int logFrequency = totalPages > 10 ? totalPages / 10 : 1; // if less than 10
107-
// pages, print
108-
// each iteration
109-
int counter = 0;
110-
111-
PDPage lastPage = this.document.getDocumentCatalog().getPages().iterator().next();
112-
for (PDPage page : this.document.getDocumentCatalog().getPages()) {
113-
lastPage = page;
114-
if (counter % logFrequency == 0 && counter / logFrequency < 10) {
115-
logger.info("Processing PDF page: {}", (counter + 1));
116-
}
117-
counter++;
106+
PDPageTree pages = this.document.getDocumentCatalog().getPages();
107+
int totalPages = pages.getCount();
108+
int logFrequency = totalPages > 10 ? totalPages / 10 : 1;
118109

119-
pagesPerDocument++;
110+
int pagesPerDocument = getPagesPerDocument(totalPages);
111+
for (PDPage page : pages) {
112+
if ((pageNumber - 1) % logFrequency == 0) {
113+
logger.info("Processing PDF page: {}", pageNumber);
114+
}
120115

121-
if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES
122-
&& pagesPerDocument >= this.config.pagesPerDocument) {
123-
pagesPerDocument = 0;
116+
handleSinglePage(page, pageNumber, pdfTextStripper, pageTextGroupList);
124117

125-
var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining());
126-
if (StringUtils.hasText(aggregatedPageTextGroup)) {
127-
readDocuments.add(toDocument(page, aggregatedPageTextGroup, startPageNumber, pageNumber));
118+
if (pageNumber % pagesPerDocument == 0 || pageNumber == totalPages) {
119+
if (!CollectionUtils.isEmpty(pageTextGroupList)) {
120+
readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()),
121+
startPageNumber, pageNumber));
122+
pageTextGroupList.clear();
128123
}
129-
pageTextGroupList.clear();
130-
131124
startPageNumber = pageNumber + 1;
132125
}
133-
int x0 = (int) page.getMediaBox().getLowerLeftX();
134-
int xW = (int) page.getMediaBox().getWidth();
135-
136-
int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin;
137-
int yW = (int) page.getMediaBox().getHeight()
138-
- (this.config.pageTopMargin + this.config.pageBottomMargin);
139-
140-
pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
141-
pdfTextStripper.extractRegions(page);
142-
var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);
143126

144-
if (StringUtils.hasText(pageText)) {
145-
146-
pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);
147-
148-
pageTextGroupList.add(pageText);
149-
}
150127
pageNumber++;
151-
pdfTextStripper.removeRegion(PDF_PAGE_REGION);
152128
}
153-
if (!CollectionUtils.isEmpty(pageTextGroupList)) {
154-
readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()),
155-
startPageNumber, pageNumber));
156-
}
157-
logger.info("Processing {} pages", totalPages);
158-
return readDocuments;
159129

130+
logger.info("Processed total {} pages", totalPages);
131+
return readDocuments;
160132
}
161133
catch (IOException e) {
162134
throw new RuntimeException(e);
163135
}
164136
}
165137

166-
protected Document toDocument(PDPage page, String docText, int startPageNumber, int endPageNumber) {
138+
private void handleSinglePage(PDPage page, int pageNumber, PDFLayoutTextStripperByArea pdfTextStripper,
139+
List<String> pageTextGroupList) throws IOException {
140+
int x0 = (int) page.getMediaBox().getLowerLeftX();
141+
int xW = (int) page.getMediaBox().getWidth();
142+
143+
int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin;
144+
int yW = (int) page.getMediaBox().getHeight() - (this.config.pageTopMargin + this.config.pageBottomMargin);
145+
146+
pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW));
147+
pdfTextStripper.extractRegions(page);
148+
var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION);
149+
150+
if (StringUtils.hasText(pageText)) {
151+
pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber);
152+
pageTextGroupList.add(pageText);
153+
}
154+
pdfTextStripper.removeRegion(PDF_PAGE_REGION);
155+
}
156+
157+
private int getPagesPerDocument(int totalPages) {
158+
if (this.config.pagesPerDocument == PdfDocumentReaderConfig.ALL_PAGES) {
159+
return totalPages;
160+
}
161+
return this.config.pagesPerDocument;
162+
}
163+
164+
protected Document toDocument(String docText, int startPageNumber, int endPageNumber) {
167165
Document doc = new Document(docText);
168166
doc.getMetadata().put(METADATA_START_PAGE_NUMBER, startPageNumber);
169167
if (startPageNumber != endPageNumber) {

document-readers/pdf-reader/src/test/java/org/springframework/ai/reader/pdf/PagePdfDocumentReaderTests.java

Lines changed: 40 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,7 @@
3030
/**
3131
* @author Christian Tzolov
3232
* @author Tibor Tarnai
33+
* @author Fu Jian
3334
*/
3435
class PagePdfDocumentReaderTests {
3536

@@ -71,4 +72,43 @@ void testIndexOutOfBound() {
7172
assertThat(documents).hasSize(64);
7273
}
7374

75+
@Test
76+
void testPagesPerDocument() {
77+
// The test PDF contains 64 pages
78+
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
79+
PdfDocumentReaderConfig.builder()
80+
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
81+
.withPagesPerDocument(32)
82+
.build())
83+
.get();
84+
85+
assertThat(documents).hasSize(2);
86+
}
87+
88+
@Test
89+
void testPagesPerDocumentNotDivisible() {
90+
// The test PDF contains 64 pages
91+
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
92+
PdfDocumentReaderConfig.builder()
93+
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
94+
.withPagesPerDocument(3)
95+
.build())
96+
.get();
97+
98+
assertThat(documents).hasSize(22);
99+
}
100+
101+
@Test
102+
void testAllPagesPerDocument() {
103+
// The test PDF contains 64 pages
104+
var documents = new PagePdfDocumentReader("classpath:/sample2.pdf",
105+
PdfDocumentReaderConfig.builder()
106+
.withPageExtractedTextFormatter(ExtractedTextFormatter.builder().build())
107+
.withPagesPerDocument(0) // all pages into one document
108+
.build())
109+
.get();
110+
111+
assertThat(documents).hasSize(1);
112+
}
113+
74114
}

0 commit comments

Comments
 (0)