Skip to content

Commit

Permalink
PDF/UA. Disable ActualText processing for non-real content
Browse files Browse the repository at this point in the history
  • Loading branch information
MaximPlusov committed Mar 28, 2024
1 parent 7cf35ea commit 3c49028
Show file tree
Hide file tree
Showing 19 changed files with 77 additions and 45 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,13 @@ public boolean isLastParsedContainsTransparency() {

public List<org.verapdf.model.operator.Operator> operatorsFromTokens(List<Object> rawTokens,
PDResourcesHandler resourcesHandler, GraphicState inheritedGraphicState,
StructureElementAccessObject structureElementAccessObject, COSObject parentStructElem, String parentsTags) {
StructureElementAccessObject structureElementAccessObject, COSObject parentStructElem,
String parentsTags, boolean isRealContent) {
List<org.verapdf.model.operator.Operator> result = new ArrayList<>();
List<COSBase> arguments = new ArrayList<>();
this.isLastParsedContainsTransparency = false;
OperatorParser parser = new OperatorParser(inheritedGraphicState, structureElementAccessObject, resourcesHandler, parentStructElem, parentsTags);
OperatorParser parser = new OperatorParser(inheritedGraphicState, structureElementAccessObject, resourcesHandler,
parentStructElem, parentsTags, isRealContent);

for (Object rawToken : rawTokens) {
if (rawToken instanceof COSBase) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,15 @@ class OperatorParser {
private final TransparencyGraphicsState transparencyGraphicState = new TransparencyGraphicsState();
private final COSObject parentStructElem;
private final String parentsTags;

private final boolean isRealContent;

private boolean insideText = false;

OperatorParser(GraphicState inheritedGraphicState,
StructureElementAccessObject structureElementAccessObject,
PDResourcesHandler resourcesHandler, COSObject parentStructElem, String parentsTags) {
PDResourcesHandler resourcesHandler, COSObject parentStructElem,
String parentsTags, boolean isRealContent) {
if (inheritedGraphicState == null) {
this.graphicState = new GraphicState(resourcesHandler);
} else {
Expand All @@ -109,6 +112,7 @@ class OperatorParser {
this.structureElementAccessObject = structureElementAccessObject;
this.parentStructElem = parentStructElem;
this.parentsTags = parentsTags;
this.isRealContent = isRealContent;
}

public TransparencyGraphicsState getTransparencyGraphicState() {
Expand Down Expand Up @@ -155,7 +159,7 @@ void parseOperator(List<org.verapdf.model.operator.Operator> processedOperators,

// MARKED CONTENT
case Operators.BMC:
GFOp_BMC bmcOp = new GFOp_BMC(arguments, getCurrentMarkedContent(), parentsTags);
GFOp_BMC bmcOp = new GFOp_BMC(arguments, getCurrentMarkedContent(), parentsTags, isRealContent);
processedOperators.add(bmcOp);
this.markedContentStack.push(bmcOp);
break;
Expand All @@ -164,7 +168,7 @@ void parseOperator(List<org.verapdf.model.operator.Operator> processedOperators,
if (specification == PDFAFlavour.Specification.ISO_19005_3) {
checkAFKey(arguments, resourcesHandler);
}
GFOp_BDC bdcOp = new GFOp_BDC(arguments, resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject, parentsTags);
GFOp_BDC bdcOp = new GFOp_BDC(arguments, resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject, parentsTags, isRealContent);
processedOperators.add(bdcOp);
this.markedContentStack.push(bdcOp);
break;
Expand Down Expand Up @@ -318,25 +322,25 @@ void parseOperator(List<org.verapdf.model.operator.Operator> processedOperators,
// TEXT SHOW
case Operators.TJ_SHOW:
GFOp_Tj tj = new GFOp_Tj(arguments, this.graphicState.clone(),
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject);
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject, isRealContent);
addFontAndColorSpace(tj, this.transparencyGraphicState);
processedOperators.add(tj);
break;
case Operators.TJ_SHOW_POS:
GFOp_TJ_Big tjBig = new GFOp_TJ_Big(arguments, this.graphicState.clone(),
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject);
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject, isRealContent);
addFontAndColorSpace(tjBig, this.transparencyGraphicState);
processedOperators.add(tjBig);
break;
case Operators.QUOTE:
GFOp_Quote quote = new GFOp_Quote(arguments, this.graphicState.clone(),
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject);
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject, isRealContent);
addFontAndColorSpace(quote, this.transparencyGraphicState);
processedOperators.add(quote);
break;
case Operators.DOUBLE_QUOTE:
GFOp_DoubleQuote doubleQuote = new GFOp_DoubleQuote(arguments, this.graphicState.clone(),
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject);
resourcesHandler, getCurrentMarkedContent(), structureElementAccessObject, isRealContent);
addFontAndColorSpace(doubleQuote, this.transparencyGraphicState);
processedOperators.add(doubleQuote);
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,14 @@ public abstract class GFOpMarkedContent extends GFOperator implements OpMarkedCo
private COSDictionary propertiesDict;
private final GFOpMarkedContent markedContent;
private final String parentsTags;
private final boolean isRealContent;

public GFOpMarkedContent(List<COSBase> arguments, final String opType,
GFOpMarkedContent markedContent, String parentsTags) {
GFOpMarkedContent markedContent, String parentsTags, boolean isRealContent) {
super(arguments, opType);
this.markedContent = markedContent;
this.parentsTags = parentsTags;
this.isRealContent = isRealContent;
}

protected void initializePropertiesDict(PDResourcesHandler resources) {
Expand Down Expand Up @@ -225,7 +227,7 @@ public int hashCode() {

private List<CosActualText> getactualText() {
COSString actualText = getActualText();
if (actualText != null) {
if (isRealContent && actualText != null) {
List<CosActualText> list = new ArrayList<>(MAX_NUMBER_OF_ELEMENTS);
list.add(new GFCosActualText(actualText));
return list;
Expand All @@ -235,7 +237,7 @@ private List<CosActualText> getactualText() {

private List<CosAlt> getalt() {
COSString alt = getAlt();
if (alt != null) {
if (isRealContent && alt != null) {
List<CosAlt> list = new ArrayList<>(MAX_NUMBER_OF_ELEMENTS);
list.add(new GFCosAlt(alt));
return list;
Expand Down Expand Up @@ -273,4 +275,7 @@ public Long getInheritedMCID() {
return markedContent != null ? markedContent.getInheritedMCID() : null;
}

/**
 * Returns the real-content flag this marked content operator was constructed with.
 *
 * @return the value of the {@code isRealContent} field
 */
public boolean isRealContent() {
return isRealContent;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ public class GFOp_BDC extends GFOpMarkedContent implements Op_BDC {


public GFOp_BDC(List<COSBase> arguments, PDResourcesHandler resources, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject, String parentsTags) {
super(arguments, OP_BDC_TYPE, markedContent, parentsTags);
StructureElementAccessObject structureElementAccessObject, String parentsTags, boolean isRealContent) {
super(arguments, OP_BDC_TYPE, markedContent, parentsTags, isRealContent);
initializePropertiesDict(resources);
this.structureElementAccessObject = structureElementAccessObject;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ public class GFOp_BMC extends GFOpMarkedContent implements Op_BMC {
/** Type name for {@code GFOp_BMC} */
public static final String OP_BMC_TYPE = "Op_BMC";

public GFOp_BMC(List<COSBase> arguments, GFOpMarkedContent markedContent, String parentsTags) {
super(arguments, OP_BMC_TYPE, markedContent, parentsTags);
public GFOp_BMC(List<COSBase> arguments, GFOpMarkedContent markedContent, String parentsTags, boolean isRealContent) {
super(arguments, OP_BMC_TYPE, markedContent, parentsTags, isRealContent);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public class GFOp_DP extends GFOpMarkedContent implements Op_DP {
public static final String OP_DP_TYPE = "Op_DP";

public GFOp_DP(List<COSBase> arguments, PDResourcesHandler resources) {
super(arguments, OP_DP_TYPE, null, "");
super(arguments, OP_DP_TYPE, null, "", false);
initializePropertiesDict(resources);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ public class GFOp_EMC extends GFOpMarkedContent implements Op_EMC {
public static final String OP_EMC_TYPE = "Op_EMC";

public GFOp_EMC(List<COSBase> arguments) {
super(arguments, OP_EMC_TYPE, null, "");
super(arguments, OP_EMC_TYPE, null, "", false);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class GFOp_MP extends GFOpMarkedContent implements Op_MP {
public static final String OP_MP_TYPE = "Op_MP";

public GFOp_MP(List<COSBase> arguments) {
super(arguments, OP_MP_TYPE, null, "");
super(arguments, OP_MP_TYPE, null, "", false);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ public class GFCIDGlyph extends GFGlyph implements CIDGlyph {

private final int cid;

protected GFCIDGlyph(PDFont font, int glyphCode, int renderingMode, String id,
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject) {
super(font, glyphCode, renderingMode, id, markedContent, structureElementAccessObject, CID_GLYPH_TYPE);
protected GFCIDGlyph(PDFont font, int glyphCode, int renderingMode, String id, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
super(font, glyphCode, renderingMode, id, markedContent, structureElementAccessObject, isRealContent, CID_GLYPH_TYPE);
this.cid = ((PDType0Font) font).toCID(glyphCode);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,15 @@ public class GFGlyph extends GenericModelObject implements Glyph {
private final GFOpMarkedContent markedContent;
private final StructureElementAccessObject structureElementAccessObject;

protected GFGlyph(PDFont font, int glyphCode, int renderingMode, String id,
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject) {
this(font, glyphCode, renderingMode, id, markedContent, structureElementAccessObject, GLYPH_TYPE);
private final boolean isRealContent;

protected GFGlyph(PDFont font, int glyphCode, int renderingMode, String id, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
this(font, glyphCode, renderingMode, id, markedContent, structureElementAccessObject, isRealContent, GLYPH_TYPE);
}

protected GFGlyph(PDFont font, int glyphCode, int renderingMode, String id,
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject, String type) {
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject, boolean isRealContent, String type) {
super(type);

FontProgram fontProgram = font.getFontProgram();
Expand Down Expand Up @@ -113,13 +115,14 @@ protected GFGlyph(PDFont font, int glyphCode, int renderingMode, String id,
this.toUnicode = font.toUnicode(glyphCode);
}
this.id = id;
this.isRealContent = isRealContent;
}

public static Glyph getGlyph(PDFont font, int glyphCode, int renderingMode, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject) {
StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
String fontId = GFIDGenerator.generateID(font);
String id = GFIDGenerator.generateID(fontId,
font.getName(), glyphCode, renderingMode, markedContent, structureElementAccessObject);
font.getName(), glyphCode, renderingMode, markedContent, structureElementAccessObject, isRealContent);
Glyph cachedGlyph = null;
Map<String, Glyph> map = StaticContainers.getCachedGlyphs().get(fontId);
if (map != null) {
Expand All @@ -129,10 +132,10 @@ public static Glyph getGlyph(PDFont font, int glyphCode, int renderingMode, GFOp
if (font.getSubtype() == ASAtom.CID_FONT_TYPE0 || font.getSubtype() == ASAtom.CID_FONT_TYPE2 ||
font.getSubtype() == ASAtom.TYPE0) {
cachedGlyph = new GFCIDGlyph(font, glyphCode, renderingMode, id,
markedContent, structureElementAccessObject);
markedContent, structureElementAccessObject, isRealContent);
} else {
cachedGlyph = new GFGlyph(font, glyphCode, renderingMode, id,
markedContent, structureElementAccessObject, GLYPH_TYPE);
markedContent, structureElementAccessObject, isRealContent);
}
if (map == null) {
map = new HashMap<>();
Expand Down Expand Up @@ -254,4 +257,9 @@ public Boolean getactualTextPresent() {
public Boolean getaltPresent() {
// Delegates to MarkedContentHelper with the Alt key, this glyph's marked content and its
// structure element access object.
// NOTE(review): presumably true when an Alt string is found through either of them - confirm in the helper.
return MarkedContentHelper.containsStringKey(ASAtom.ALT, markedContent, structureElementAccessObject);
}

/**
 * Returns the real-content flag stored on this glyph when it was constructed.
 *
 * @return the {@code isRealContent} field, boxed as a {@code Boolean}
 */
@Override
public Boolean getisRealContent() {
return isRealContent;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ public abstract class GFOpStringTextShow extends GFOpTextShow {

protected GFOpStringTextShow(List<COSBase> arguments, GraphicState state,
PDResourcesHandler resources, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject, final String opType) {
super(arguments, state, resources, markedContent, structureElementAccessObject, opType);
StructureElementAccessObject structureElementAccessObject, boolean isRealContent, final String opType) {
super(arguments, state, resources, markedContent, structureElementAccessObject, isRealContent, opType);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,12 @@ public abstract class GFOpTextShow extends GFOperator implements OpTextShow {
private List<PDFont> fonts = null;
private final org.verapdf.model.pdlayer.PDColorSpace fillCS;
private final org.verapdf.model.pdlayer.PDColorSpace strokeCS;

private final boolean isRealContent;

protected GFOpTextShow(List<COSBase> arguments, GraphicState state, PDResourcesHandler resourcesHandler,
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject,
final String opType) {
boolean isRealContent, final String opType) {
super(arguments, opType);
this.font = state.getFont();
this.scaleFactor = state.getScaleFactor();
Expand All @@ -97,6 +99,7 @@ protected GFOpTextShow(List<COSBase> arguments, GraphicState state, PDResourcesH
this.structureElementAccessObject = structureElementAccessObject;
this.fillCS = parseFillColorSpace();
this.strokeCS = parseStrokeColorSpace();
this.isRealContent = isRealContent;
}

@Override
Expand Down Expand Up @@ -150,7 +153,7 @@ private List<Glyph> getUsedGlyphs() {
while (inputStream.available() > 0) {
int code = font.readCode(inputStream);
Glyph glyph = GFGlyph.getGlyph(font, code, this.renderingMode.getValue(),
markedContent, structureElementAccessObject);
markedContent, structureElementAccessObject, isRealContent);
res.add(glyph);
}
} catch (IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ public class GFOp_DoubleQuote extends GFOpStringTextShow implements Op_DoubleQuo
public static final int COUNT_OF_OPERATOR_OPERANDS = 3;

public GFOp_DoubleQuote(List<COSBase> arguments, GraphicState state, PDResourcesHandler resourcesHandler,
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, OP_DOUBLE_QUOTE_TYPE);
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, isRealContent, OP_DOUBLE_QUOTE_TYPE);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ public class GFOp_Quote extends GFOpStringTextShow implements Op_Quote {
public static final String OP_QUOTE_TYPE = "Op_Quote";

public GFOp_Quote(List<COSBase> arguments, GraphicState state, PDResourcesHandler resourcesHandler,
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, OP_QUOTE_TYPE);
GFOpMarkedContent markedContent, StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, isRealContent, OP_QUOTE_TYPE);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ public class GFOp_TJ_Big extends GFOpTextShow implements Op_TJ_Big {

public GFOp_TJ_Big(List<COSBase> arguments, GraphicState state,
PDResourcesHandler resourcesHandler, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, OP_TJ_BIG_TYPE);
StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, isRealContent, OP_TJ_BIG_TYPE);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public class GFOp_Tj extends GFOpStringTextShow implements Op_Tj {

public GFOp_Tj(List<COSBase> arguments, GraphicState state,
PDResourcesHandler resourcesHandler, GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, OP_TJ_TYPE);
StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
super(arguments, state, resourcesHandler, markedContent, structureElementAccessObject, isRealContent, OP_TJ_TYPE);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ protected void parseOperators() {
streamParser.parseTokens();
OperatorFactory operatorFactory = new OperatorFactory();
List<Operator> result = operatorFactory.operatorsFromTokens(streamParser.getTokens(),
resourcesHandler, inheritedGraphicState, structureElementAccessObject, parentStructElem, parentsTags);
resourcesHandler, inheritedGraphicState, structureElementAccessObject,
parentStructElem, parentsTags, isSemantic());
this.containsTransparency = operatorFactory.isLastParsedContainsTransparency();
this.operators = Collections.unmodifiableList(result);
} finally {
Expand Down Expand Up @@ -216,5 +217,9 @@ private List<PDResources> getResources() {
}
return result;
}

/**
 * Supplies the value that {@code parseOperators()} passes as the last argument
 * ({@code isRealContent}) of {@code OperatorFactory.operatorsFromTokens(...)}.
 * This implementation always answers {@code false}; being {@code protected} and
 * non-final, it can be overridden by a subclass.
 *
 * @return {@code false}
 */
protected boolean isSemantic() {
return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,9 @@ private List<SEContentItem> getContentItem() {
return Collections.unmodifiableList(list);
}

/**
 * Overrides the inherited {@code isSemantic()} so that it always answers {@code true}.
 * NOTE(review): presumably the superclass forwards this value as the isRealContent flag
 * when parsing operators - confirm against the superclass.
 *
 * @return {@code true}
 */
@Override
protected boolean isSemantic() {
return true;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,12 @@ public static String generateID(PDType0Font font) {

public static String generateID(String fontID, String fontName, int glyphCode, int renderingMode,
GFOpMarkedContent markedContent,
StructureElementAccessObject structureElementAccessObject) {
StructureElementAccessObject structureElementAccessObject, boolean isRealContent) {
String markedContentID = markedContent == null ? "" : String.valueOf(markedContent.hashCode());
String structureElementAccessID = structureElementAccessObject == null ? "" :
String.valueOf(structureElementAccessObject.hashCode());
return fontID + ' ' + fontName + ' ' + glyphCode + ' ' + renderingMode + ' ' +
markedContentID + ' ' + structureElementAccessID;
markedContentID + ' ' + structureElementAccessID + ' ' + isRealContent;
}

public static String generateID(PDFont rawFont, RenderingMode renderingMode) {
Expand Down

0 comments on commit 3c49028

Please sign in to comment.