Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST: Patched test results for #749 #822

Merged
merged 7 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions jhove-bbt/scripts/create-1.27-target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,16 @@ if [[ -f "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-94-false-positive.
cp "${candidateRoot}/regression/modules/PDF-hul/pdf-hul-94-false-positive.pdf.jhove.xml" "${targetRoot}/regression/modules/PDF-hul/pdf-hul-94-false-positive.pdf.jhove.xml"
fi

# Copy Regression corpus results for files affected by fix for issue 672, filters as indirect objects.
# A result is only copied when the candidate run actually produced it, so a
# missing candidate file leaves the existing target untouched.
for issue672Result in \
    "pdf-hul-8-Secured.pdf.jhove.xml" \
    "pdf-hul-11-govdocs-152588.pdf.jhove.xml"; do
    if [[ -f "${candidateRoot}/regression/modules/PDF-hul/${issue672Result}" ]]; then
        # The message names the check performed here; it previously repeated the
        # text of the unrelated "empty PDF string handling" check.
        echo " - Regression check for filters as indirect objects."
        cp "${candidateRoot}/regression/modules/PDF-hul/${issue672Result}" "${targetRoot}/regression/modules/PDF-hul/${issue672Result}"
    fi
done

# Update release details for PDF module
find "${targetRoot}" -type f -name "*.jpg.jhove.xml" -exec sed -i 's/^ <reportingModule release="1.5.3" date="2022-04-22">JPEG-hul<\/reportingModule>$/ <reportingModule release="1.5.4" date="2023-01-31">JPEG-hul<\/reportingModule>/' {} \;
find "${targetRoot}" -type f -name "audit.jhove.xml" -exec sed -i 's/^ <module release="1.5.3">JPEG-hul<\/module>$/ <module release="1.5.4">JPEG-hul<\/module>/' {} \;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ public class PdfModule extends ModuleBase {
private static final String DICT_KEY_TRAILER = "trailer";
private static final String DICT_KEY_SIZE = "Size";
private static final String DICT_KEY_ENCRYPT = "Encrypt";
private static final String DICT_KEY_STMF = "StmF";
private static final String DICT_KEY_INFO = "Info";
private static final String DICT_KEY_ID = "ID";
private static final String DICT_KEY_FONT_NAME = "FontName";
Expand Down Expand Up @@ -441,15 +442,16 @@ public class PdfModule extends ModuleBase {
protected int[][] _xref2; // Array of int[2], giving object stream and
// offset when _xref[i] < 0
protected boolean _xrefIsStream; // True if XRef streams rather than tables
// are used
protected boolean _encrypted; // Equivalent to _encryptDictRef != null
protected List<Property> _docCatalogList; // Info extracted from doc cat dict
protected List<Property> _encryptList; // Info from encryption dict
protected List<Property> _docInfoList; // Info from doc info dict
protected List<Property> _extStreamsList; // List of external streams
protected List<Property> _imagesList; // List of image streams
protected List<Property> _filtersList; // List of filters
protected List<Property> _pagesList; // List of PageObjects
// are used
protected boolean _encrypted; // Equivalent to _encryptDictRef != null
protected boolean _streamsEncrypted; // streams are encrypted and can't be parsed.
protected List<Property> _docCatalogList; // Info extracted from doc cat dict
protected List<Property> _encryptList; // Info from encryption dict
protected List<Property> _docInfoList; // Info from doc info dict
protected List<Property> _extStreamsList; // List of external streams
protected List<Property> _imagesList; // List of image streams
protected List<Property> _filtersList; // List of filters
protected List<Property> _pagesList; // List of PageObjects

/** Map of Type 0 font dictionaries. */
protected Map<Integer, PdfObject> _type0FontsMap;
Expand Down Expand Up @@ -858,7 +860,7 @@ public final void parse(RandomAccessFile raf, RepInfo info)
return;
}
findExternalStreams(info);
if (!findFilters(info)) {
if (!findFilters(info) && !_streamsEncrypted) {
return;
}
findImages(info);
Expand Down Expand Up @@ -1277,6 +1279,11 @@ protected boolean parseTrailer(RepInfo info, boolean prevOnly)
throw new PdfInvalidException(MessageConstants.PDF_HUL_70, // PDF-HUL-70
_parser.getOffset());
}
// readEncryptDict alone is not enough to detect encryption, so flag it here whenever an Encrypt entry exists.
_encryptDictRef = (PdfIndirectObj) dict.get(DICT_KEY_ENCRYPT);
if (_encryptDictRef != null) {
_encrypted = true;
}
/*
* We don't need to see a trailer dictionary.
* Move along, move along.
Expand Down Expand Up @@ -1655,10 +1662,12 @@ private boolean readDocCatalogDict(RepInfo info) throws IOException {
pModeText);
_docCatalogList.add(p);

PdfObject outlines = resolveIndirectObject(
_docCatDict.get(DICT_KEY_OUTLINES));
if (outlines instanceof PdfDictionary) {
_outlineDict = (PdfDictionary) outlines;
if (!_encrypted) {
PdfObject outlines = resolveIndirectObject(
_docCatDict.get(DICT_KEY_OUTLINES));
if (outlines instanceof PdfDictionary) {
_outlineDict = (PdfDictionary) outlines;
}
}

PdfObject lang = resolveIndirectObject(
Expand All @@ -1674,9 +1683,11 @@ private boolean readDocCatalogDict(RepInfo info) throws IOException {
// but this is a convenient time to grab it and the page label
// dictionary.
_pagesDictRef = (PdfIndirectObj) _docCatDict.get(DICT_KEY_PAGES);
_pageLabelDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_PAGE_LABELS));

if (!_encrypted) {
_pageLabelDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_PAGE_LABELS));
}

// Grab the Version entry, and use it to override the
// file header IF it's later.
PdfObject vers = resolveIndirectObject(
Expand Down Expand Up @@ -1760,8 +1771,11 @@ private boolean readDocCatalogDict(RepInfo info) throws IOException {
// Get the Names dictionary in order to grab the
// EmbeddedFiles and Dests entries.
try {
PdfDictionary namesDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_NAMES));
PdfDictionary namesDict = null;
if (!_encrypted) {
namesDict = (PdfDictionary) resolveIndirectObject(
_docCatDict.get(DICT_KEY_NAMES));
}
if (namesDict != null) {
PdfDictionary embeddedDict = (PdfDictionary) resolveIndirectObject(
namesDict.get(DICT_KEY_EMBEDDED_FILES));
Expand Down Expand Up @@ -1967,6 +1981,10 @@ protected boolean readEncryptDict(RepInfo info) throws IOException {
PROP_NAME_STANDARD_SECURITY_HANDLER,
PropertyType.PROPERTY, PropertyArity.LIST, stdList));
}
PdfObject streamEncrypted = _encryptDict.get(DICT_KEY_STMF);
if (streamEncrypted instanceof PdfSimpleObject) {
_streamsEncrypted = true;
}

} catch (PdfException e) {
e.disparage(info);
Expand Down Expand Up @@ -2039,21 +2057,24 @@ protected boolean readDocumentTree(RepInfo info) {
}

PdfObject pagesObj = resolveIndirectObject(_pagesDictRef);
if (!(pagesObj instanceof PdfDictionary))
if (pagesObj != null && !(pagesObj instanceof PdfDictionary)) {
throw new PdfMalformedException(MessageConstants.PDF_HUL_97); // PDF-HUL-97
PdfDictionary pagesDict = (PdfDictionary) pagesObj;

// Check that the pages dict has a key type and the types value is
// Pages
if (!checkTypeKey(pagesDict, info, KEY_VAL_PAGES,
MessageConstants.PDF_HUL_146, // PDF-HUL-146
MessageConstants.PDF_HUL_144, // PDF-HUL-144
MessageConstants.PDF_HUL_145)) { // PDF-HUL-145
return false;
} else if (pagesObj != null) {

PdfDictionary pagesDict = (PdfDictionary) pagesObj;

// Check that the pages dict has a key type and the types value is
// Pages
if (!checkTypeKey(pagesDict, info, KEY_VAL_PAGES,
MessageConstants.PDF_HUL_146, // PDF-HUL-146
MessageConstants.PDF_HUL_144, // PDF-HUL-144
MessageConstants.PDF_HUL_145)) { // PDF-HUL-145
return false;
}

_docTreeRoot = new PageTreeNode(this, null, pagesDict);
_docTreeRoot.buildSubtree(true, MAX_PAGE_TREE_DEPTH);
}

_docTreeRoot = new PageTreeNode(this, null, pagesDict);
_docTreeRoot.buildSubtree(true, MAX_PAGE_TREE_DEPTH);
} catch (PdfException e) {
e.disparage(info);
if (e.getJhoveMessage() != null)
Expand Down Expand Up @@ -2317,6 +2338,10 @@ protected String extractFilters(Filter[] filters, PdfStream stream) {

protected void findImages(RepInfo info) throws IOException {
_imagesList = new LinkedList<Property>();
// needed if object streams are encrypted
if (_docTreeRoot == null) {
return;
}
_docTreeRoot.startWalk();
try {
for (;;) {
Expand Down Expand Up @@ -2590,6 +2615,10 @@ protected void findFonts(RepInfo info) throws IOException {
_type3FontsMap = new HashMap<Integer, PdfObject>();
_cid0FontsMap = new HashMap<Integer, PdfObject>();
_cid2FontsMap = new HashMap<Integer, PdfObject>();
//needed if object streams are encrypted
if (_docTreeRoot == null) {
return;
}
try {
_docTreeRoot.startWalk();
for (;;) {
Expand Down Expand Up @@ -2782,7 +2811,7 @@ protected PdfObject getObject(int objIndex, int recGuard)
return getObjectFromStream(objIndex, recGuard);
}
_parser.seek(offset);
PdfObject obj = _parser.readObjectDef();
PdfObject obj = _parser.readObjectDef(this);
//
// Experimental carl@openpreservation.org 2018-03-14
//
Expand Down Expand Up @@ -2971,6 +3000,10 @@ protected void addFontsProperty(List<Property> metadataList) {
protected void addPagesProperty(List<Property> metadataList, RepInfo info) {
_pagesList = new LinkedList<Property>();
_pageSeqMap = new HashMap<Integer, Integer>(500);
// needed if object streams are encrypted
if (_docTreeRoot == null) {
return;
}
try {
_docTreeRoot.startWalk();
int pageIndex = 0;
Expand Down Expand Up @@ -4485,29 +4518,34 @@ private PdfObject getObjectFromStream(final int objIndex,
int objStreamIndex = _xref2[objIndex][0];
PdfObject streamObj;
ObjectStream ostrm = null;
if (objStreamIndex == _cachedStreamIndex) {
ostrm = _cachedObjectStream;
// Reset it
if (ostrm.isValid()) {
ostrm.readIndex();
}
} else {
streamObj = resolveIndirectObject(
getObject(objStreamIndex, recGuard - 1));
if (streamObj instanceof PdfStream) {
ostrm = new ObjectStream((PdfStream) streamObj, _raf);
if (!_streamsEncrypted) {
if (objStreamIndex == _cachedStreamIndex) {
ostrm = _cachedObjectStream;
// Reset it
if (ostrm.isValid()) {
ostrm.readIndex();
_cachedObjectStream = ostrm;
_cachedStreamIndex = objStreamIndex;
} else {
throw new PdfMalformedException(
MessageConstants.PDF_HUL_108); // PDF-HUL-108
}
} else {
streamObj = resolveIndirectObject(
getObject(objStreamIndex, recGuard - 1));
if (streamObj instanceof PdfStream) {
ostrm = new ObjectStream((PdfStream) streamObj, _raf);
if (ostrm.isValid()) {
ostrm.readIndex();
_cachedObjectStream = ostrm;
_cachedStreamIndex = objStreamIndex;
} else {
throw new PdfMalformedException(
MessageConstants.PDF_HUL_108); // PDF-HUL-108
}
}
}
/* And finally extract the object from the object stream. */
return ostrm.getObject(objIndex);
}else {
return null;
}
/* And finally extract the object from the object stream. */
return ostrm.getObject(objIndex);

} catch (ZipException excep) {
_logger.info(excep.getMessage());
throw new PdfMalformedException(MessageConstants.PDF_HUL_109); // PDF-HUL-109
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import edu.harvard.hul.ois.jhove.messages.JhoveMessage;
import edu.harvard.hul.ois.jhove.messages.JhoveMessages;
import edu.harvard.hul.ois.jhove.module.PdfModule;

/**
* The Parser class implements some limited syntactic analysis for PDF.
Expand All @@ -32,6 +33,7 @@ public class Parser
private Map<Long, PdfObject> _objectMap;
/** PDF/A compliance flag. */
private boolean _pdfACompliant;
protected PdfModule _module;


/**
Expand Down Expand Up @@ -241,7 +243,7 @@ public PdfObject readObjectDef (Numeric objNumTok)
if (strm != null) {
// Assimilate the dictionary and the stream token into the
// object to be returned
PdfStream strmObj = new PdfStream ((PdfDictionary) obj, strm);
PdfStream strmObj = new PdfStream ((PdfDictionary) obj, strm, _module);
if (!strmObj.isPdfaCompliant()) {
_pdfACompliant = false;
}
Expand Down Expand Up @@ -467,4 +469,9 @@ public void scanMode (boolean flag)
{
_tokenizer.scanMode (flag);
}

/**
 * Reads an object definition on behalf of the given module.
 * The module is stored in {@code _module} first, then the work is
 * delegated to the no-argument {@code readObjectDef()}.
 *
 * @param pdfModule the module to store in {@code _module} before reading
 * @return the object returned by the no-argument {@code readObjectDef()}
 * @throws IOException propagated from the delegated read
 * @throws PdfException propagated from the delegated read
 */
public PdfObject readObjectDef(PdfModule pdfModule) throws IOException, PdfException {
    this._module = pdfModule;
    final PdfObject definition = readObjectDef();
    return definition;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

import java.util.*;

import edu.harvard.hul.ois.jhove.module.PdfModule;

/**
* A representation of a PDF stream object.
* A PdfStream consists of a dictionary and a stream token.
Expand All @@ -20,7 +22,7 @@ public class PdfStream extends PdfObject
private PdfDictionary _dict;
private Filter[] _filters;
private boolean pdfaCompliant;

protected PdfModule _module;
/**
* Creates a PdfStream
*
Expand All @@ -40,7 +42,6 @@ public PdfStream (PdfDictionary dict, Stream stream,
extractFilters ();
}


/**
* Creates a PdfStream.
*
Expand All @@ -57,6 +58,24 @@ public PdfStream (PdfDictionary dict, Stream stream)
extractFilters ();
}

/**
 * Creates a PdfStream that keeps a reference to the module that is
 * reading it.
 *
 * @param dict A dictionary describing the stream
 * @param stream A Stream token
 * @param module The invoking PdfModule, stored in {@code _module}
 * @throws PdfException propagated from filter extraction
 */
public PdfStream (PdfDictionary dict, Stream stream, PdfModule module)
        throws PdfException
{
    // The superclass constructor is invoked implicitly.
    this._module = module;
    this._dict = dict;
    this._stream = stream;
    this.pdfaCompliant = true; // presumed compliant until shown otherwise
    // Must run last: every field above is assigned before filters are extracted.
    extractFilters ();
}


/**
* Returns the stream's dictionary
Expand Down Expand Up @@ -186,8 +205,17 @@ private void extractFilters () throws PdfException
else {
/* Only other allowed value is a string */
Filter[] val = new Filter[1];
val[0] = new Filter
(((PdfSimpleObject) filter).getStringValue());
if(filter instanceof PdfSimpleObject) {
val[0] = new Filter
(((PdfSimpleObject) filter).getStringValue());
} else {
System.out.println("we zijn er");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I left some debug code in (System.out.println("we zijn er");). This should be deleted.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's the issue with bigger PRs, and I let the dependency issue distract me. I'll remove this and re-submit.

if (filter instanceof PdfIndirectObj) {
val[0] = new Filter
(((PdfSimpleObject) _module.resolveIndirectObject(filter)).getStringValue());
}
}

if (parms instanceof PdfDictionary) {
val[0].setDecodeParms((PdfDictionary) parms);
}
Expand Down