Skip to content

Commit

Permalink
TIKA-3091 prevent NPE in PDFParserConfig by initializing
Browse files Browse the repository at this point in the history
three parameters with default values.

# Conflicts:
#	tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
  • Loading branch information
tballison committed Apr 14, 2020
1 parent 86cdbb1 commit f51ae0a
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,17 @@ void setExtractInlineImages(boolean extractInlineImages) {
defaultConfig.setExtractInlineImages(extractInlineImages);
}

/**
 * Forwards the average-character tolerance to {@code defaultConfig}.
 *
 * @param averageCharTolerance value passed unchanged to
 *        {@code defaultConfig.setAverageCharTolerance}
 */
@Field
void setAverageCharTolerance(float averageCharTolerance) {
defaultConfig.setAverageCharTolerance(averageCharTolerance);
}

/**
 * Forwards the spacing tolerance to {@code defaultConfig}.
 *
 * @param spacingTolerance value passed unchanged to
 *        {@code defaultConfig.setSpacingTolerance}
 */
@Field
void setSpacingTolerance(float spacingTolerance) {
defaultConfig.setSpacingTolerance(spacingTolerance);
}


@Field
void setCatchIntermediateExceptions(boolean catchIntermediateExceptions) {
defaultConfig.setCatchIntermediateIOExceptions(catchIntermediateExceptions);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,16 @@ private static OCR_STRATEGY parse(String s) {
private boolean extractMarkedContent = false;

//The character width-based tolerance value used to estimate where spaces in text should be added
private Float averageCharTolerance;
//Default taken from PDFBox.
private Float averageCharTolerance = 0.5f;

//The space width-based tolerance value used to estimate where spaces in text should be added
private Float spacingTolerance;
//Default taken from PDFBox.
private Float spacingTolerance = 0.3f;

// The multiplication factor for line height used to decide when a new paragraph starts.
// Default taken from PDFBox.
private Float dropThreshold = 2.5f;

//If the PDF has an XFA element, process only that and skip extracting
//content from elsewhere in the document.
Expand Down Expand Up @@ -238,6 +244,10 @@ private void init(InputStream is) {

setSetKCMS(getBooleanProp(props.getProperty("setKCMS"), false));

setAverageCharTolerance(getFloatProp(props.getProperty("averageCharTolerance"), averageCharTolerance));
setSpacingTolerance(getFloatProp(props.getProperty("spacingTolerance"), spacingTolerance));
setDropThreshold(getFloatProp(props.getProperty("dropThreshold"), dropThreshold));

boolean checkExtractAccessPermission = getBooleanProp(props.getProperty("checkExtractAccessPermission"), false);
boolean allowExtractionForAccessibility = getBooleanProp(props.getProperty("allowExtractionForAccessibility"), true);

Expand Down Expand Up @@ -287,6 +297,9 @@ public void configure(PDF2XHTML pdf2XHTML) {
if (getSpacingTolerance() != null) {
pdf2XHTML.setSpacingTolerance(getSpacingTolerance());
}
if (getDropThreshold() != null) {
pdf2XHTML.setDropThreshold(dropThreshold);
}
pdf2XHTML.setSuppressDuplicateOverlappingText(getSuppressDuplicateOverlappingText());
}

Expand Down Expand Up @@ -513,6 +526,14 @@ public void setSpacingTolerance(Float spacingTolerance) {
this.spacingTolerance = spacingTolerance;
}

/**
 * Returns the drop threshold currently held by this config.
 *
 * @return the value of the {@code dropThreshold} field
 */
public Float getDropThreshold() {
return dropThreshold;
}

/**
 * Stores a new drop threshold on this config.
 * The parameter is a primitive {@code float}, so this setter cannot assign
 * {@code null} to the boxed field.
 *
 * @param dropThreshold the new drop threshold
 */
public void setDropThreshold(float dropThreshold) {
this.dropThreshold = dropThreshold;
}

/**
 * Returns the access checker held by this config.
 *
 * @return the value of the {@code accessChecker} field
 */
public AccessChecker getAccessChecker() {
return accessChecker;
}
Expand Down Expand Up @@ -824,6 +845,7 @@ public boolean equals(Object o) {
if (getCatchIntermediateIOExceptions() != config.getCatchIntermediateIOExceptions()) return false;
if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) return false;
if (!getSpacingTolerance().equals(config.getSpacingTolerance())) return false;
if (!getDropThreshold().equals(config.getDropThreshold())) return false;
if (!getOcrStrategy().equals(config.getOcrStrategy())) return false;
if (getOcrImageType() != config.getOcrImageType()) return false;
if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) return false;
Expand All @@ -844,6 +866,7 @@ public int hashCode() {
result = 31 * result + (getExtractUniqueInlineImagesOnly() ? 1 : 0);
result = 31 * result + getAverageCharTolerance().hashCode();
result = 31 * result + getSpacingTolerance().hashCode();
result = 31 * result + getDropThreshold().hashCode();
result = 31 * result + (getIfXFAExtractOnlyXFA() ? 1 : 0);
result = 31 * result + ocrStrategy.hashCode();
result = 31 * result + getOcrDPI();
Expand All @@ -869,6 +892,7 @@ public String toString() {
", extractUniqueInlineImagesOnly=" + extractUniqueInlineImagesOnly +
", averageCharTolerance=" + averageCharTolerance +
", spacingTolerance=" + spacingTolerance +
", dropThreshold=" + dropThreshold +
", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA +
", ocrStrategy=" + ocrStrategy +
", ocrDPI=" + ocrDPI +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1528,6 +1528,16 @@ public void testUnmappedUnicodeStats() throws Exception {

}

/**
 * Regression test for TIKA-3091: {@code toString()}, {@code hashCode()} and
 * {@code equals()} must all be callable on a freshly constructed
 * {@link PDFParserConfig} without throwing a NullPointerException.
 */
@Test
public void testNPEInPDFParserConfig() {
    final PDFParserConfig freshConfig = new PDFParserConfig();

    // The results are deliberately discarded; the test passes as long as
    // none of these calls throws.
    freshConfig.toString();
    freshConfig.hashCode();

    final PDFParserConfig otherFreshConfig = new PDFParserConfig();
    freshConfig.equals(otherFreshConfig);
}

@Test //TIKA-3041
@Ignore("turn back on if we add file from PDFBOX-52")
public void testPDFBox52() throws Exception {
Expand Down

0 comments on commit f51ae0a

Please sign in to comment.