Skip to content

Commit

Permalink
Migrate to PDFBox 3.0.0-RC1
Browse files (browse the repository at this point in the history)
  • Loading branch information
jazzido committed Sep 5, 2023
1 parent 8bfa3ad commit 8fe2837
Show file tree
Hide file tree
Showing 8 changed files with 19 additions and 27 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.28</version>
<version>3.0.0</version>
</dependency>

<dependency>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/technology/tabula/CommandLineApp.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException
private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
PDDocument pdfDocument = null;
try {
pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password);
pdfDocument = this.password == null ? org.apache.pdfbox.Loader.loadPDF(pdfFile) : org.apache.pdfbox.Loader.loadPDF(pdfFile, this.password);
PageIterator pageIterator = getPageIterator(pdfDocument);
List<Table> tables = new ArrayList<>();

Expand Down
4 changes: 2 additions & 2 deletions src/main/java/technology/tabula/debug/Debug.java
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re
boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells,
boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths,
boolean drawDetectedTables) throws IOException {
PDDocument document = PDDocument.load(new File(pdfPath));
PDDocument document = org.apache.pdfbox.Loader.loadPDF(new File(pdfPath));

ObjectExtractor oe = new ObjectExtractor(document);

Expand Down Expand Up @@ -349,7 +349,7 @@ public static void main(String[] args) throws IOException {

if (pages == null) {
// user specified all pages
PDDocument document = PDDocument.load(pdfFile);
PDDocument document = org.apache.pdfbox.Loader.loadPDF(pdfFile);

int numPages = document.getNumberOfPages();
pages = new ArrayList<>(numPages);
Expand Down
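3 changes: 1 addition & 2 deletions src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java
Original file line number Diff line number Diff line change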
Expand Up @@ -804,8 +804,7 @@ private List<Ruling> getVerticalRulings(BufferedImage image) {
private PDDocument removeText(PDPage page) throws IOException {

PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
List<Object> tokens = parser.getTokens();
List<Object> tokens = parser.parse();
List<Object> newTokens = new ArrayList<>();
for (Object token : tokens) {
if (token instanceof Operator) {
Expand Down
27 changes: 10 additions & 17 deletions src/test/java/technology/tabula/TestObjectExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,17 @@

public class TestObjectExtractor {

/*@Test(expected=IOException.class)
public void testWrongPasswordRaisesException() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"));
ObjectExtractor oe = new ObjectExtractor(pdf_document, "wrongpass");
oe.extract().next();
}*/

@Test(expected = IOException.class)
public void testEmptyOnEncryptedFileRaisesException() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
oe.extract().next();
}
}

@Test
public void testCanReadPDFWithOwnerEncryption() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();
int i = 0;
Expand All @@ -44,7 +37,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException {

@Test
public void testGoodPassword() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword");
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword");
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
List<Page> pages = new ArrayList<>();
PageIterator pi = oe.extract();
Expand All @@ -58,7 +51,7 @@ public void testGoodPassword() throws IOException {

@Test
public void testTextExtractionDoesNotRaise() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/rotated_page.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();

Expand All @@ -70,7 +63,7 @@ public void testTextExtractionDoesNotRaise() throws IOException {

@Test
public void testShouldDetectRulings() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf"));
try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();

Expand All @@ -85,7 +78,7 @@ public void testShouldDetectRulings() throws IOException {

@Test
public void testDontThrowNPEInShfill() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/labor.pdf"));

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
PageIterator pi = oe.extract();
Expand All @@ -101,7 +94,7 @@ public void testDontThrowNPEInShfill() throws IOException {

@Test
public void testExtractOnePage() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
assertEquals(2, pdf_document.getNumberOfPages());

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Expand All @@ -114,7 +107,7 @@ public void testExtractOnePage() throws IOException {

@Test(expected = IndexOutOfBoundsException.class)
public void testExtractWrongPageNumber() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
assertEquals(2, pdf_document.getNumberOfPages());

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Expand All @@ -124,7 +117,7 @@ public void testExtractWrongPageNumber() throws IOException {

@Test
public void testTextElementsContainedInPage() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf"));

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Page page = oe.extractPage(1);
Expand All @@ -137,7 +130,7 @@ public void testTextElementsContainedInPage() throws IOException {
}

@Test public void testDoNotNPEInPointComparator() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/npe_issue_206.pdf"));

try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) {
Page p = oe.extractPage(1);
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/technology/tabula/TestTableDetection.java
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ public void testDetectionOfTables() throws Exception {
NodeList tables = regionDocument.getElementsByTagName("table");

// tabula extractors
PDDocument pdfDocument = PDDocument.load(this.pdf);
PDDocument pdfDocument = org.apache.pdfbox.Loader.loadPDF(this.pdf);
ObjectExtractor extractor = new ObjectExtractor(pdfDocument);

// parse expected tables from the ground truth dataset
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/technology/tabula/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ public void testQuickSortLongList() {

@Test
public void testJPEG2000DoesNotRaise() throws IOException {
PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/jpeg2000.pdf"));
PDDocument pdf_document = org.apache.pdfbox.Loader.loadPDF(new File("src/test/resources/technology/tabula/jpeg2000.pdf"));
PDPage page = pdf_document.getPage(0);
Utils.pageConvertToImage(pdf_document, page, 360, ImageType.RGB);
}
Expand Down
4 changes: 2 additions & 2 deletions src/test/java/technology/tabula/UtilsForTesting.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ public static Page getAreaFromPage(String path, int page, float top, float left,
public static Page getPage(String path, int pageNumber) throws IOException {
ObjectExtractor oe = null;
try {
PDDocument document = PDDocument
.load(new File(path));
PDDocument document = org.apache.pdfbox.Loader
.loadPDF(new File(path));
oe = new ObjectExtractor(document);
Page page = oe.extract(pageNumber);
return page;
Expand Down

0 comments on commit 8fe2837

Please sign in to comment.