Skip to content

Commit

Permalink
Merge pull request #822 from openpreserve/fix/indirect-filter
Browse files Browse the repository at this point in the history
TEST: Patched test results for #749
  • Loading branch information
carlwilson authored Jan 18, 2023
2 parents 44f67d0 + 7c36c50 commit 0ef4377
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 92 deletions.
10 changes: 10 additions & 0 deletions jhove-bbt/scripts/create-1.27-target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,16 @@ if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-94-false-positive.
cp "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-94-false-positive.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pdf-hul-94-false-positive.pdf.jhove.xml"
fi

# Copy Regression corpus results for files affected by fix for issue 672, filters as indirect objects.
# Each result is only copied when the candidate run actually produced it, so a
# missing candidate file leaves the existing target result untouched.
# The progress message names the fix these files relate to and the file being
# copied, rather than repeating the unrelated "empty PDF string handling" text.
if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-8-Secured.pdf.jhove.xml" ]]; then
	echo " - Regression check for filters as indirect objects (pdf-hul-8-Secured.pdf)."
	cp "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-8-Secured.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pdf-hul-8-Secured.pdf.jhove.xml"
fi
if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-11-govdocs-152588.pdf.jhove.xml" ]]; then
	echo " - Regression check for filters as indirect objects (pdf-hul-11-govdocs-152588.pdf)."
	cp "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-11-govdocs-152588.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pdf-hul-11-govdocs-152588.pdf.jhove.xml"
fi

# Update release details for PDF module
find "${targetRoot}" -type f -name "*.jpg.jhove.xml" -exec sed -i 's/^ <reportingModule release="1.5.3" date="2022-04-22">JPEG-hul<\/reportingModule>$/ <reportingModule release="1.5.4" date="2023-01-31">JPEG-hul<\/reportingModule>/' {} \;
find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ <module release="1.5.3">JPEG-hul<\/module>$/ <module release="1.5.4">JPEG-hul<\/module>/' {} \;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ public class PdfModule extends ModuleBase {
private static final String DICT_KEY_TRAILER = "trailer";
private static final String DICT_KEY_SIZE = "Size";
private static final String DICT_KEY_ENCRYPT = "Encrypt";
private static final String DICT_KEY_STMF = "StmF";
private static final String DICT_KEY_INFO = "Info";
private static final String DICT_KEY_ID = "ID";
private static final String DICT_KEY_FONT_NAME = "FontName";
Expand Down Expand Up @@ -441,15 +442,16 @@ public class PdfModule extends ModuleBase {
protected int[][] _xref2; // Array of int[2], giving object stream and
// offset when _xref[i] < 0
protected boolean _xrefIsStream; // True if XRef streams rather than tables
// are used
protected boolean _encrypted; // Equivalent to _encryptDictRef != null
protected List<Property> _docCatalogList; // Info extracted from doc cat dict
protected List<Property> _encryptList; // Info from encryption dict
protected List<Property> _docInfoList; // Info from doc info dict
protected List<Property> _extStreamsList; // List of external streams
protected List<Property> _imagesList; // List of image streams
protected List<Property> _filtersList; // List of filters
protected List<Property> _pagesList; // List of PageObjects
// are used
protected boolean _encrypted; // Equivalent to _encryptDictRef != null
protected boolean _streamsEncrypted; // streams are encrypted and can't be parsed.
protected List<Property> _docCatalogList; // Info extracted from doc cat dict
protected List<Property> _encryptList; // Info from encryption dict
protected List<Property> _docInfoList; // Info from doc info dict
protected List<Property> _extStreamsList; // List of external streams
protected List<Property> _imagesList; // List of image streams
protected List<Property> _filtersList; // List of filters
protected List<Property> _pagesList; // List of PageObjects

/** Map of Type 0 font dictionaries. */
protected Map<Integer, PdfObject> _type0FontsMap;
Expand Down Expand Up @@ -858,7 +860,7 @@ public final void parse(RandomAccessFile raf, RepInfo info)
return;
}
findExternalStreams(info);
if (!findFilters(info)) {
if (!findFilters(info) && !_streamsEncrypted) {
return;
}
findImages(info);
Expand Down Expand Up @@ -1277,6 +1279,11 @@ protected boolean parseTrailer(RepInfo info, boolean prevOnly)
throw new PdfInvalidException(MessageConstants.PDF_HUL_70, // PDF-HUL-70
_parser.getOffset());
}
// readEncryptDict alone is not enough to detect encryption, so also record
// the Encrypt entry here and flag the document as encrypted when it exists.
_encryptDictRef = (PdfIndirectObj) dict.get(DICT_KEY_ENCRYPT);
if (_encryptDictRef != null) {
_encrypted = true;
}
/*
* We don't need to see a trailer dictionary.
* Move along, move along.
Expand Down Expand Up @@ -1655,10 +1662,12 @@ private boolean readDocCatalogDict(RepInfo info) throws IOException {
pModeText);
_docCatalogList.add(p);

PdfObject outlines = resolveIndirectObject(
_docCatDict.get(DICT_KEY_OUTLINES));
if (outlines instanceof PdfDictionary) {
_outlineDict = (PdfDictionary) outlines;
if (!_encrypted) {
PdfObject outlines = resolveIndirectObject(
_docCatDict.get(DICT_KEY_OUTLINES));
if (outlines instanceof PdfDictionary) {
_outlineDict = (PdfDictionary) outlines;
}
}

PdfObject lang = resolveIndirectObject(
Expand All @@ -1674,9 +1683,11 @@ private boolean readDocCatalogDict(RepInfo info) throws IOException {
// but this is a convenient time to grab it and the page label
// dictionary.
_pagesDictRef = (PdfIndirectObj) _docCatDict.get(DICT_KEY_PAGES);
_pageLabelDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_PAGE_LABELS));

if (!_encrypted) {
_pageLabelDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_PAGE_LABELS));
}

// Grab the Version entry, and use it to override the
// file header IF it's later.
PdfObject vers = resolveIndirectObject(
Expand Down Expand Up @@ -1760,8 +1771,11 @@ private boolean readDocCatalogDict(RepInfo info) throws IOException {
// Get the Names dictionary in order to grab the
// EmbeddedFiles and Dests entries.
try {
PdfDictionary namesDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_NAMES));
PdfDictionary namesDict = null;
if (!_encrypted) {
namesDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_NAMES));
}
if (namesDict != null) {
PdfDictionary embeddedDict = (PdfDictionary) resolveIndirectObject(
namesDict.get(DICT_KEY_EMBEDDED_FILES));
Expand Down Expand Up @@ -1967,6 +1981,10 @@ protected boolean readEncryptDict(RepInfo info) throws IOException {
PROP_NAME_STANDARD_SECURITY_HANDLER,
PropertyType.PROPERTY, PropertyArity.LIST, stdList));
}
PdfObject streamEncrypted = _encryptDict.get(DICT_KEY_STMF);
if (streamEncrypted instanceof PdfSimpleObject) {
_streamsEncrypted = true;
}

} catch (PdfException e) {
e.disparage(info);
Expand Down Expand Up @@ -2039,21 +2057,24 @@ protected boolean readDocumentTree(RepInfo info) {
}

PdfObject pagesObj = resolveIndirectObject(_pagesDictRef);
if (!(pagesObj instanceof PdfDictionary))
if (pagesObj != null && !(pagesObj instanceof PdfDictionary)) {
throw new PdfMalformedException(MessageConstants.PDF_HUL_97); // PDF-HUL-97
PdfDictionary pagesDict = (PdfDictionary) pagesObj;

// Check that the pages dict has a key type and the types value is
// Pages
if (!checkTypeKey(pagesDict, info, KEY_VAL_PAGES,
MessageConstants.PDF_HUL_146, // PDF-HUL-146
MessageConstants.PDF_HUL_144, // PDF-HUL-144
MessageConstants.PDF_HUL_145)) { // PDF-HUL-145
return false;
} else if (pagesObj != null) {

PdfDictionary pagesDict = (PdfDictionary) pagesObj;

// Check that the pages dict has a key type and the types value is
// Pages
if (!checkTypeKey(pagesDict, info, KEY_VAL_PAGES,
MessageConstants.PDF_HUL_146, // PDF-HUL-146
MessageConstants.PDF_HUL_144, // PDF-HUL-144
MessageConstants.PDF_HUL_145)) { // PDF-HUL-145
return false;
}

_docTreeRoot = new PageTreeNode(this, null, pagesDict);
_docTreeRoot.buildSubtree(true, MAX_PAGE_TREE_DEPTH);
}

_docTreeRoot = new PageTreeNode(this, null, pagesDict);
_docTreeRoot.buildSubtree(true, MAX_PAGE_TREE_DEPTH);
} catch (PdfException e) {
e.disparage(info);
if (e.getJhoveMessage() != null)
Expand Down Expand Up @@ -2317,6 +2338,10 @@ protected String extractFilters(Filter[] filters, PdfStream stream) {

protected void findImages(RepInfo info) throws IOException {
_imagesList = new LinkedList<Property>();
// needed if object streams are encrypted
if (_docTreeRoot == null) {
return;
}
_docTreeRoot.startWalk();
try {
for (;;) {
Expand Down Expand Up @@ -2590,6 +2615,10 @@ protected void findFonts(RepInfo info) throws IOException {
_type3FontsMap = new HashMap<Integer, PdfObject>();
_cid0FontsMap = new HashMap<Integer, PdfObject>();
_cid2FontsMap = new HashMap<Integer, PdfObject>();
//needed if object streams are encrypted
if (_docTreeRoot == null) {
return;
}
try {
_docTreeRoot.startWalk();
for (;;) {
Expand Down Expand Up @@ -2782,7 +2811,7 @@ protected PdfObject getObject(int objIndex, int recGuard)
return getObjectFromStream(objIndex, recGuard);
}
_parser.seek(offset);
PdfObject obj = _parser.readObjectDef();
PdfObject obj = _parser.readObjectDef(this);
//
// Experimental carl@openpreservation.org 2018-03-14
//
Expand Down Expand Up @@ -2971,6 +3000,10 @@ protected void addFontsProperty(List<Property> metadataList) {
protected void addPagesProperty(List<Property> metadataList, RepInfo info) {
_pagesList = new LinkedList<Property>();
_pageSeqMap = new HashMap<Integer, Integer>(500);
// needed if object streams are encrypted
if (_docTreeRoot == null) {
return;
}
try {
_docTreeRoot.startWalk();
int pageIndex = 0;
Expand Down Expand Up @@ -4485,29 +4518,34 @@ private PdfObject getObjectFromStream(final int objIndex,
int objStreamIndex = _xref2[objIndex][0];
PdfObject streamObj;
ObjectStream ostrm = null;
if (objStreamIndex == _cachedStreamIndex) {
ostrm = _cachedObjectStream;
// Reset it
if (ostrm.isValid()) {
ostrm.readIndex();
}
} else {
streamObj = resolveIndirectObject(
getObject(objStreamIndex, recGuard - 1));
if (streamObj instanceof PdfStream) {
ostrm = new ObjectStream((PdfStream) streamObj, _raf);
if (!_streamsEncrypted) {
if (objStreamIndex == _cachedStreamIndex) {
ostrm = _cachedObjectStream;
// Reset it
if (ostrm.isValid()) {
ostrm.readIndex();
_cachedObjectStream = ostrm;
_cachedStreamIndex = objStreamIndex;
} else {
throw new PdfMalformedException(
MessageConstants.PDF_HUL_108); // PDF-HUL-108
}
} else {
streamObj = resolveIndirectObject(
getObject(objStreamIndex, recGuard - 1));
if (streamObj instanceof PdfStream) {
ostrm = new ObjectStream((PdfStream) streamObj, _raf);
if (ostrm.isValid()) {
ostrm.readIndex();
_cachedObjectStream = ostrm;
_cachedStreamIndex = objStreamIndex;
} else {
throw new PdfMalformedException(
MessageConstants.PDF_HUL_108); // PDF-HUL-108
}
}
}
/* And finally extract the object from the object stream. */
return ostrm.getObject(objIndex);
}else {
return null;
}
/* And finally extract the object from the object stream. */
return ostrm.getObject(objIndex);

} catch (ZipException excep) {
_logger.info(excep.getMessage());
throw new PdfMalformedException(MessageConstants.PDF_HUL_109); // PDF-HUL-109
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import edu.harvard.hul.ois.jhove.messages.JhoveMessage;
import edu.harvard.hul.ois.jhove.messages.JhoveMessages;
import edu.harvard.hul.ois.jhove.module.PdfModule;

/**
* The Parser class implements some limited syntactic analysis for PDF.
Expand All @@ -32,6 +33,7 @@ public class Parser
private Map<Long, PdfObject> _objectMap;
/** PDF/A compliance flag. */
private boolean _pdfACompliant;
protected PdfModule _module;


/**
Expand Down Expand Up @@ -241,7 +243,7 @@ public PdfObject readObjectDef (Numeric objNumTok)
if (strm != null) {
// Assimilate the dictionary and the stream token into the
// object to be returned
PdfStream strmObj = new PdfStream ((PdfDictionary) obj, strm);
PdfStream strmObj = new PdfStream ((PdfDictionary) obj, strm, _module);
if (!strmObj.isPdfaCompliant()) {
_pdfACompliant = false;
}
Expand Down Expand Up @@ -467,4 +469,9 @@ public void scanMode (boolean flag)
{
_tokenizer.scanMode (flag);
}

/**
 * Stores the supplied PDF module on this parser, then reads an object
 * definition through the no-argument {@code readObjectDef()}.
 *
 * The module is kept in {@code _module} after the call returns; it is the
 * value that {@code readObjectDef(Numeric)} passes to the {@code PdfStream}
 * it constructs.
 *
 * @param pdfModule the module to remember in {@code _module}
 * @return whatever {@code readObjectDef()} returns
 * @throws IOException  propagated from {@code readObjectDef()}
 * @throws PdfException propagated from {@code readObjectDef()}
 */
public PdfObject readObjectDef(PdfModule pdfModule) throws IOException, PdfException {
    // Remember the caller before parsing so the module is available while
    // the object definition is being read.
    this._module = pdfModule;
    final PdfObject definition = readObjectDef();
    return definition;
}
}
Loading

0 comments on commit 0ef4377

Please sign in to comment.