|
25 | 25 | import org.apache.pdfbox.pdfparser.PDFParser; |
26 | 26 | import org.apache.pdfbox.pdmodel.PDDocument; |
27 | 27 | import org.apache.pdfbox.pdmodel.PDPage; |
| 28 | +import org.apache.pdfbox.pdmodel.PDPageTree; |
28 | 29 | import org.slf4j.Logger; |
29 | 30 | import org.slf4j.LoggerFactory; |
30 | 31 |
|
|
44 | 45 | * pageBottomMargin = 0 |
45 | 46 | * |
46 | 47 | * @author Christian Tzolov |
| 48 | + * @author Fu Jian |
47 | 49 | */ |
48 | 50 | public class PagePdfDocumentReader implements DocumentReader { |
49 | 51 |
|
@@ -96,74 +98,70 @@ public List<Document> get() { |
96 | 98 | try { |
97 | 99 | var pdfTextStripper = new PDFLayoutTextStripperByArea(); |
98 | 100 |
|
99 | | - int pageNumber = 0; |
100 | | - int pagesPerDocument = 0; |
101 | | - int startPageNumber = pageNumber; |
| 101 | + int pageNumber = 1; |
| 102 | + int startPageNumber = 1; |
102 | 103 |
|
103 | 104 | List<String> pageTextGroupList = new ArrayList<>(); |
104 | 105 |
|
105 | | - int totalPages = this.document.getDocumentCatalog().getPages().getCount(); |
106 | | - int logFrequency = totalPages > 10 ? totalPages / 10 : 1; // if less than 10 |
107 | | - // pages, print |
108 | | - // each iteration |
109 | | - int counter = 0; |
110 | | - |
111 | | - PDPage lastPage = this.document.getDocumentCatalog().getPages().iterator().next(); |
112 | | - for (PDPage page : this.document.getDocumentCatalog().getPages()) { |
113 | | - lastPage = page; |
114 | | - if (counter % logFrequency == 0 && counter / logFrequency < 10) { |
115 | | - logger.info("Processing PDF page: {}", (counter + 1)); |
116 | | - } |
117 | | - counter++; |
| 106 | + PDPageTree pages = this.document.getDocumentCatalog().getPages(); |
| 107 | + int totalPages = pages.getCount(); |
| 108 | + int logFrequency = totalPages > 10 ? totalPages / 10 : 1; |
118 | 109 |
|
119 | | - pagesPerDocument++; |
| 110 | + int pagesPerDocument = getPagesPerDocument(totalPages); |
| 111 | + for (PDPage page : pages) { |
| 112 | + if ((pageNumber - 1) % logFrequency == 0) { |
| 113 | + logger.info("Processing PDF page: {}", pageNumber); |
| 114 | + } |
120 | 115 |
|
121 | | - if (this.config.pagesPerDocument != PdfDocumentReaderConfig.ALL_PAGES |
122 | | - && pagesPerDocument >= this.config.pagesPerDocument) { |
123 | | - pagesPerDocument = 0; |
| 116 | + handleSinglePage(page, pageNumber, pdfTextStripper, pageTextGroupList); |
124 | 117 |
|
125 | | - var aggregatedPageTextGroup = pageTextGroupList.stream().collect(Collectors.joining()); |
126 | | - if (StringUtils.hasText(aggregatedPageTextGroup)) { |
127 | | - readDocuments.add(toDocument(page, aggregatedPageTextGroup, startPageNumber, pageNumber)); |
| 118 | + if (pageNumber % pagesPerDocument == 0 || pageNumber == totalPages) { |
| 119 | + if (!CollectionUtils.isEmpty(pageTextGroupList)) { |
| 120 | + readDocuments.add(toDocument(pageTextGroupList.stream().collect(Collectors.joining()), |
| 121 | + startPageNumber, pageNumber)); |
| 122 | + pageTextGroupList.clear(); |
128 | 123 | } |
129 | | - pageTextGroupList.clear(); |
130 | | - |
131 | 124 | startPageNumber = pageNumber + 1; |
132 | 125 | } |
133 | | - int x0 = (int) page.getMediaBox().getLowerLeftX(); |
134 | | - int xW = (int) page.getMediaBox().getWidth(); |
135 | | - |
136 | | - int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin; |
137 | | - int yW = (int) page.getMediaBox().getHeight() |
138 | | - - (this.config.pageTopMargin + this.config.pageBottomMargin); |
139 | | - |
140 | | - pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW)); |
141 | | - pdfTextStripper.extractRegions(page); |
142 | | - var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION); |
143 | 126 |
|
144 | | - if (StringUtils.hasText(pageText)) { |
145 | | - |
146 | | - pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber); |
147 | | - |
148 | | - pageTextGroupList.add(pageText); |
149 | | - } |
150 | 127 | pageNumber++; |
151 | | - pdfTextStripper.removeRegion(PDF_PAGE_REGION); |
152 | 128 | } |
153 | | - if (!CollectionUtils.isEmpty(pageTextGroupList)) { |
154 | | - readDocuments.add(toDocument(lastPage, pageTextGroupList.stream().collect(Collectors.joining()), |
155 | | - startPageNumber, pageNumber)); |
156 | | - } |
157 | | - logger.info("Processing {} pages", totalPages); |
158 | | - return readDocuments; |
159 | 129 |
|
| 130 | + logger.info("Processed total {} pages", totalPages); |
| 131 | + return readDocuments; |
160 | 132 | } |
161 | 133 | catch (IOException e) { |
162 | 134 | throw new RuntimeException(e); |
163 | 135 | } |
164 | 136 | } |
165 | 137 |
|
166 | | - protected Document toDocument(PDPage page, String docText, int startPageNumber, int endPageNumber) { |
| 138 | + private void handleSinglePage(PDPage page, int pageNumber, PDFLayoutTextStripperByArea pdfTextStripper, |
| 139 | + List<String> pageTextGroupList) throws IOException { |
| 140 | + int x0 = (int) page.getMediaBox().getLowerLeftX(); |
| 141 | + int xW = (int) page.getMediaBox().getWidth(); |
| 142 | + |
| 143 | + int y0 = (int) page.getMediaBox().getLowerLeftY() + this.config.pageTopMargin; |
| 144 | + int yW = (int) page.getMediaBox().getHeight() - (this.config.pageTopMargin + this.config.pageBottomMargin); |
| 145 | + |
| 146 | + pdfTextStripper.addRegion(PDF_PAGE_REGION, new Rectangle(x0, y0, xW, yW)); |
| 147 | + pdfTextStripper.extractRegions(page); |
| 148 | + var pageText = pdfTextStripper.getTextForRegion(PDF_PAGE_REGION); |
| 149 | + |
| 150 | + if (StringUtils.hasText(pageText)) { |
| 151 | + pageText = this.config.pageExtractedTextFormatter.format(pageText, pageNumber); |
| 152 | + pageTextGroupList.add(pageText); |
| 153 | + } |
| 154 | + pdfTextStripper.removeRegion(PDF_PAGE_REGION); |
| 155 | + } |
| 156 | + |
| 157 | + private int getPagesPerDocument(int totalPages) { |
| 158 | + if (this.config.pagesPerDocument == PdfDocumentReaderConfig.ALL_PAGES) { |
| 159 | + return totalPages; |
| 160 | + } |
| 161 | + return this.config.pagesPerDocument; |
| 162 | + } |
| 163 | + |
| 164 | + protected Document toDocument(String docText, int startPageNumber, int endPageNumber) { |
167 | 165 | Document doc = new Document(docText); |
168 | 166 | doc.getMetadata().put(METADATA_START_PAGE_NUMBER, startPageNumber); |
169 | 167 | if (startPageNumber != endPageNumber) { |
|
0 commit comments