From 35bf7bdc5a6a952a9090cc477fd5afb8609a9ab0 Mon Sep 17 00:00:00 2001 From: danfickle Date: Fri, 18 Jan 2019 17:18:35 +1100 Subject: [PATCH] #79 - Even more robust parent/child relationships based on box tree instead of DOM elements. Also started to use correct tags. --- .../java/com/openhtmltopdf/render/Box.java | 10 ++ .../pdfboxout/PdfBoxAccessibilityHelper.java | 158 +++++++++++------- .../pdfboxout/PdfBoxFastOutputDevice.java | 5 +- 3 files changed, 107 insertions(+), 66 deletions(-) diff --git a/openhtmltopdf-core/src/main/java/com/openhtmltopdf/render/Box.java b/openhtmltopdf-core/src/main/java/com/openhtmltopdf/render/Box.java index b8a039eb6..94a84ef53 100755 --- a/openhtmltopdf-core/src/main/java/com/openhtmltopdf/render/Box.java +++ b/openhtmltopdf-core/src/main/java/com/openhtmltopdf/render/Box.java @@ -101,6 +101,8 @@ public abstract class Box implements Styleable, DisplayListItem { private Area _absoluteClipBox; private boolean _clipBoxCalculated = false; + private Object _accessibilityObject; + protected Box() { } @@ -552,6 +554,14 @@ public boolean hasNonTextContent(CssContext c) { return false; } + + public void setAccessiblityObject(Object object) { + this._accessibilityObject = object; + } + + public Object getAccessibilityObject() { + return this._accessibilityObject; + } public void paintRootElementBackground(RenderingContext c) { PaintingInfo pI = getPaintingInfo(); diff --git a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxAccessibilityHelper.java b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxAccessibilityHelper.java index de5b955eb..970591508 100644 --- a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxAccessibilityHelper.java +++ b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxAccessibilityHelper.java @@ -3,8 +3,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; -import java.util.Map; - import 
org.apache.pdfbox.cos.COSArray; import org.apache.pdfbox.cos.COSDictionary; import org.apache.pdfbox.cos.COSInteger; @@ -18,14 +16,13 @@ import org.apache.pdfbox.pdmodel.documentinterchange.taggedpdf.StandardStructureTypes; import org.w3c.dom.Document; import org.w3c.dom.Element; -import org.w3c.dom.Node; - import com.openhtmltopdf.extend.StructureType; +import com.openhtmltopdf.render.BlockBox; import com.openhtmltopdf.render.Box; +import com.openhtmltopdf.render.LineBox; import com.openhtmltopdf.render.RenderingContext; public class PdfBoxAccessibilityHelper { - private final Map<Element, StructureItem> _structureMap = new HashMap<>(); private final List<List<StructureItem>> _pageContentItems = new ArrayList<>(); private final PdfBoxFastOutputDevice _od; @@ -43,11 +40,12 @@ public PdfBoxAccessibilityHelper(PdfBoxFastOutputDevice od) { private static class StructureItem { private final StructureType type; - private final Box box; + private Box box; private final List<StructureItem> children = new ArrayList<>(); private COSDictionary dict; private PDStructureElement elem; + private PDStructureElement parentElem; private int mcid = -1; private StructureItem parent; private PDPage page; @@ -92,9 +90,8 @@ public void finishPdfUa() { root.appendKid(rootElem); - StructureItem rootStruct = _structureMap.get(_doc.getDocumentElement()); - rootStruct.elem = rootElem; - finishStructure(rootStruct); + _root.elem = rootElem; + finishStructure(_root, _root.elem); _od.getWriter().getDocumentCatalog().setStructureTreeRoot(root); } @@ -107,8 +104,8 @@ public void finishPdfUa() { COSArray mcidParentReferences = new COSArray(); for (StructureItem item : pageItems) { - System.out.println("item = " + item + ", parent = " + item.parent + " ,," + item.parent.elem); - mcidParentReferences.add(item.parent.elem); +System.out.println("%%%%%%%item = " + item + ", parent = " + item.parentElem); + mcidParentReferences.add(item.parentElem); } numTree.add(COSInteger.get(i)); @@ -126,31 +123,59 @@ public void finishPdfUa() { } private String
chooseTag(StructureItem item) { - return item.box != null && item.box.getStyle().isInline() ? "Span" : "P"; // TODO. + if (item.box != null) { + if (item.box.getLayer() != null) { + return StandardStructureTypes.SECT; + } else if (item.box instanceof BlockBox) { + BlockBox block = (BlockBox) item.box; + + if (block.isFloated()) { + return StandardStructureTypes.NOTE; + } else if (block.isInline()) { + return StandardStructureTypes.SPAN; + } else if (block.getElement() != null && block.getElement().getNodeName().equals("p")) { + return StandardStructureTypes.P; + } else { + return StandardStructureTypes.DIV; + } + + // TODO: Tables. + } else { + return StandardStructureTypes.SPAN; + } + } + + return StandardStructureTypes.SPAN; } - private void finishStructure(StructureItem item) { - System.out.println("item = " + item + " ,," + item.mcid); + private void finishStructure(StructureItem item, PDStructureElement parent) { for (StructureItem child : item.children) { if (child.mcid == -1) { if (child.children.isEmpty()) { continue; } - String pdfTag = chooseTag(child); - - child.elem = new PDStructureElement(pdfTag, item.elem); - System.out.println("child = " + child + "!!!!!!" 
+ child.elem); - child.elem.setParent(item.elem); - child.elem.setPage(child.page); - - item.elem.appendKid(child.elem); + if (child.box instanceof LineBox && + !child.box.hasNonTextContent(_ctx)) { + finishStructure(child, parent); + } else { + String pdfTag = chooseTag(child); - finishStructure(child); + child.parentElem = parent; + child.elem = new PDStructureElement(pdfTag, parent); + child.elem.setParent(parent); + child.elem.setPage(child.page); +System.out.println("ADDING$$: " + child + " :::: " + child.elem + "-----" + pdfTag); + parent.appendKid(child.elem); + + finishStructure(child, child.elem); + } } else if (child.type == StructureType.TEXT) { - item.elem.appendKid(new PDMarkedContent(COSName.getPDFName("Span"), child.dict)); + child.parentElem = parent; + parent.appendKid(new PDMarkedContent(COSName.getPDFName("Span"), child.dict)); } else if (child.type == StructureType.BACKGROUND) { - item.elem.appendKid(new PDArtifactMarkedContent(child.dict)); + child.parentElem = parent; + parent.appendKid(new PDArtifactMarkedContent(child.dict)); } } } @@ -165,29 +190,6 @@ private Element getBoxElement(Box box) { } } - private StructureItem findParentStructualElement(Box box) { - Element elem = getBoxElement(box); - Node parent = elem.getParentNode(); - - StructureItem item; - - if (parent == null || parent instanceof Document) { - item = _root; - } else { - item = _structureMap.get(elem.getParentNode()); - } - - System.out.println("ch = " + box + " parent = " + item + ", " + elem.getParentNode().getNodeName()); - - return item; - } - - private StructureItem findCurrentStructualElement(Box box) { - Element elem = getBoxElement(box); - - return _structureMap.get(elem); - } - private COSDictionary createMarkedContentDictionary() { COSDictionary dict = new COSDictionary(); dict.setInt(COSName.MCID, _nextMcid); @@ -195,35 +197,62 @@ private COSDictionary createMarkedContentDictionary() { return dict; } + private void ensureAncestorTree(StructureItem child, Box 
parent) { + // Walk up the ancestor tree making sure they all have accessibility objects. + while (parent != null && parent.getAccessibilityObject() == null) { + StructureItem parentItem = createStructureItem(null, parent); + parent.setAccessiblityObject(parentItem); + parentItem.children.add(child); + child.parent = parentItem; + child = parentItem; + parent = parent.getParent(); + } + } + private StructureItem createStructureItem(StructureType type, Box box) { + StructureItem child = (StructureItem) box.getAccessibilityObject(); - Element elem = getBoxElement(box); - StructureItem item = _structureMap.get(elem); - - if (item == null) { - item = new StructureItem(type, box); - _structureMap.put(elem, item); - - item.parent = findParentStructualElement(box); - item.parent.children.add(item); + if (child == null) { + child = new StructureItem(type, box); + child.page = _page; - item.page = _page; + box.setAccessiblityObject(child); + + ensureAncestorTree(child, box.getParent()); + ensureParent(box, child); + } else if (child.box == null) { + child.box = box; } - //System.out.println("-------ADD: " + item + " , &&" + item.parent); - return item; +System.out.println("-------ADD: " + child + " && " + child.parent); + return child; + } + + public void ensureParent(Box box, StructureItem child) { + if (child.parent == null) { + if (box.getParent() != null) { + StructureItem parent = (StructureItem) box.getParent().getAccessibilityObject(); + parent.children.add(child); + child.parent = parent; + } else { + _root.children.add(child); + child.parent = _root; + } + } } private StructureItem createMarkedContentStructureItem(StructureType type, Box box) { StructureItem current = new StructureItem(type, box); - StructureItem parent = findCurrentStructualElement(box); - System.out.println("mcid prent = " + parent + " , " + current); + + ensureAncestorTree(current, box.getParent()); + ensureParent(box, current); + current.mcid = _nextMcid; current.dict = 
createMarkedContentDictionary(); - current.parent = parent; - current.parent.children.add(current); _pageContentItems.get(_pageContentItems.size() - 1).add(current); + +System.out.println("+++++++ADD: " + current + " !! " + current.parent + " !! " + current.mcid); return current; } @@ -300,7 +329,6 @@ public void setDocument(Document doc) { this._doc = doc; StructureItem rootStruct = new StructureItem(null, null); - _structureMap.put(_doc.getDocumentElement(), rootStruct); _root = rootStruct; } diff --git a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxFastOutputDevice.java b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxFastOutputDevice.java index 023dbdadd..2fa7f2248 100644 --- a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxFastOutputDevice.java +++ b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxFastOutputDevice.java @@ -847,7 +847,10 @@ public void start(Document doc) { _bmManager = new PdfBoxBookmarkManager(doc, _writer, _sharedContext, _dotsPerPoint, this); _linkManager = new PdfBoxFastLinkManager(_sharedContext, _dotsPerPoint, _root, this); loadMetadata(doc); - _pdfUa.setDocument(doc); + + if (_pdfUa != null) { + _pdfUa.setDocument(doc); + } } public void finish(RenderingContext c, Box root) {