From 1437eb85b0b1c674051aa20b6a192a2a63426662 Mon Sep 17 00:00:00 2001 From: Quentin Ligier Date: Wed, 3 Mar 2021 18:16:35 +0100 Subject: [PATCH 1/2] Improve support for PDF/A and PDF/UA --- .../com/openhtmltopdf/util/LogMessageId.java | 1 + .../pdfboxout/PdfBoxRenderer.java | 220 ++++++++++-------- .../pdfboxout/PdfRendererBuilder.java | 17 +- 3 files changed, 134 insertions(+), 104 deletions(-) diff --git a/openhtmltopdf-core/src/main/java/com/openhtmltopdf/util/LogMessageId.java b/openhtmltopdf-core/src/main/java/com/openhtmltopdf/util/LogMessageId.java index 517798c05..7de6c8f5a 100644 --- a/openhtmltopdf-core/src/main/java/com/openhtmltopdf/util/LogMessageId.java +++ b/openhtmltopdf-core/src/main/java/com/openhtmltopdf/util/LogMessageId.java @@ -42,6 +42,7 @@ enum LogMessageId0Param implements LogMessageId { GENERAL_PDF_SPECIFIED_FONTS_DONT_CONTAIN_A_SPACE_CHARACTER(XRLog.GENERAL, "Specified fonts don't contain a space character!"), GENERAL_PDF_USING_FAST_MODE(XRLog.GENERAL, "Using fast-mode renderer. Prepare to fly."), GENERAL_PDF_ACCESSIBILITY_NO_DOCUMENT_TITLE_PROVIDED(XRLog.GENERAL, "No document title provided. Document will not be PDF/UA compliant."), + GENERAL_PDF_ACCESSIBILITY_NO_DOCUMENT_DESCRIPTION_PROVIDED(XRLog.GENERAL, "No document description provided. Document will not be PDF/UA compliant."), GENERAL_PDF_USING_GET_REQUEST_FOR_FORM(XRLog.GENERAL, "Using GET request method for form. You probably meant to add a method=\"post\" attribute to your form"), GENERAL_PDF_ACROBAT_READER_DOES_NOT_SUPPORT_FORMS_WITH_FILE_INPUT(XRLog.GENERAL, "Acrobat Reader does not support forms with file input controls"), diff --git a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java index 6d5cec53a..33e587462 100644 --- a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java +++ b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java @@ -35,6 +35,7 @@ import com.openhtmltopdf.outputdevice.helper.ExternalResourceControlPriority; import com.openhtmltopdf.outputdevice.helper.ExternalResourceType; import com.openhtmltopdf.extend.FSDOMMutator; +import com.openhtmltopdf.outputdevice.helper.NullUserInterface; import com.openhtmltopdf.outputdevice.helper.PageDimensions; import com.openhtmltopdf.outputdevice.helper.UnicodeImplementation; import com.openhtmltopdf.pdfboxout.PdfBoxSlowOutputDevice.Metadata; @@ -65,11 +66,7 @@ import org.apache.pdfbox.pdmodel.graphics.color.PDOutputIntent; import org.apache.pdfbox.pdmodel.interactive.viewerpreferences.PDViewerPreferences; import org.apache.xmpbox.XMPMetadata; -import org.apache.xmpbox.schema.AdobePDFSchema; -import org.apache.xmpbox.schema.DublinCoreSchema; -import org.apache.xmpbox.schema.PDFAIdentificationSchema; -import org.apache.xmpbox.schema.XMPBasicSchema; -import org.apache.xmpbox.schema.XMPSchema; +import org.apache.xmpbox.schema.*; import org.apache.xmpbox.type.BadFieldValueException; import org.apache.xmpbox.xml.XmpSerializer; import org.w3c.dom.Document; @@ -84,6 +81,8 @@ import java.awt.*; import java.awt.geom.Rectangle2D; import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.Calendar; import java.util.List; import java.util.logging.Level; @@ -604,10 +603,8 @@ private void writePDFFast(List pages, RenderingContext c, Rectangle2D f firePreWrite(pageCount); // opportunity to adjust meta data setDidValues(doc); // set PDF header fields from meta data - if (_pdfUaConformance) { - addPdfUaXMPSchema(doc); - } else if (_pdfAConformance != PdfAConformance.NONE) { - addPdfASchema(doc, _pdfAConformance.getPart(), _pdfAConformance.getConformanceValue()); + if (_pdfUaConformance || _pdfAConformance != PdfAConformance.NONE) { + addPdfASchema(doc, _pdfAConformance, _pdfUaConformance); } DisplayListCollector dlCollector = new DisplayListCollector(_root.getLayer().getPages()); @@ -685,8 +682,8 @@ private void writePDF(List pages, RenderingContext c, Rectangle2D first firePreWrite(pageCount); // opportunity to adjust meta data setDidValues(doc); // set PDF header fields from meta data - if (_pdfAConformance != PdfAConformance.NONE) { - addPdfASchema(doc, _pdfAConformance.getPart(), _pdfAConformance.getConformanceValue()); + if (_pdfUaConformance || _pdfAConformance != PdfAConformance.NONE) { + addPdfASchema(doc, _pdfAConformance, _pdfUaConformance); } for (int i = 0; i < pageCount; i++) { @@ -712,107 +709,104 @@ private void writePDF(List pages, RenderingContext c, Rectangle2D first // Kindly provided by GurpusMaximus at: // https://stackoverflow.com/questions/49682339/how-can-i-create-an-accessible-pdf-with-java-pdfbox-2-0-8-library-that-is-also-v - private void addPdfUaXMPSchema(PDDocument doc) { - try - { - PDDocumentCatalog catalog = doc.getDocumentCatalog(); - String lang = _doc.getDocumentElement().getAttribute("lang"); - catalog.setLanguage(!lang.isEmpty() ? lang : "EN-US"); - catalog.setViewerPreferences(new PDViewerPreferences(new COSDictionary())); - catalog.getViewerPreferences().setDisplayDocTitle(true); - - PDMarkInfo markInfo = new PDMarkInfo(); - markInfo.setMarked(true); - catalog.setMarkInfo(markInfo); - - PDDocumentInformation info = doc.getDocumentInformation(); - String title = info.getTitle() != null ? info.getTitle() : ""; - - if (title.isEmpty()) { - XRLog.log(Level.WARNING, LogMessageId.LogMessageId0Param.GENERAL_PDF_ACCESSIBILITY_NO_DOCUMENT_TITLE_PROVIDED); - } - - XMPMetadata xmp = XMPMetadata.createXMPMetadata(); - xmp.createAndAddDublinCoreSchema(); - xmp.getDublinCoreSchema().setTitle(title); - String metaDescription = _outputDevice.getMetadataByName("description"); - xmp.getDublinCoreSchema().setDescription(metaDescription != null ? metaDescription : title); - xmp.createAndAddPDFAExtensionSchemaWithDefaultNS(); - xmp.getPDFExtensionSchema().addNamespace( - "http://www.aiim.org/pdfa/ns/schema#", "pdfaSchema"); - xmp.getPDFExtensionSchema().addNamespace( - "http://www.aiim.org/pdfa/ns/property#", "pdfaProperty"); - xmp.getPDFExtensionSchema().addNamespace( - "http://www.aiim.org/pdfua/ns/id/", "pdfuaid"); - XMPSchema uaSchema = new XMPSchema(XMPMetadata.createXMPMetadata(), - "pdfaSchema", "pdfaSchema", "pdfaSchema"); - uaSchema.setTextPropertyValue("schema", - "PDF/UA Universal Accessibility Schema"); - uaSchema.setTextPropertyValue("namespaceURI", - "http://www.aiim.org/pdfua/ns/id/"); - uaSchema.setTextPropertyValue("prefix", "pdfuaid"); - XMPSchema uaProp = new XMPSchema(XMPMetadata.createXMPMetadata(), - "pdfaProperty", "pdfaProperty", "pdfaProperty"); - uaProp.setTextPropertyValue("name", "part"); - uaProp.setTextPropertyValue("valueType", "Integer"); - uaProp.setTextPropertyValue("category", "internal"); - uaProp.setTextPropertyValue("description", - "Indicates, which part of ISO 14289 standard is followed"); - uaSchema.addUnqualifiedSequenceValue("property", uaProp); - xmp.getPDFExtensionSchema().addBagValue("schemas", uaSchema); - xmp.getPDFExtensionSchema().setPrefix("pdfuaid"); - xmp.getPDFExtensionSchema().setTextPropertyValue("part", "1"); - XmpSerializer serializer = new XmpSerializer(); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - serializer.serialize(xmp, baos, true); - PDMetadata metadata = new PDMetadata(doc); - metadata.importXMPMetadata(baos.toByteArray()); - doc.getDocumentCatalog().setMetadata(metadata); - } catch (IOException|TransformerException e) { - throw new RuntimeException(e); - } - } - - private void addPdfASchema(PDDocument document, int part, String conformance) { + private void addPdfASchema(PDDocument document, PdfAConformance pdfAConformance, boolean isPdfUa) { PDDocumentInformation information = document.getDocumentInformation(); XMPMetadata metadata = XMPMetadata.createXMPMetadata(); try { + // NOTE: These XMP metadata MUST match up with the document information dictionary + // to be a valid PDF/A document, As per ISO 19005-1:2005/Cor.1:2007, 6.7.2 String title = information.getTitle(); String author = information.getAuthor(); String subject = information.getSubject(); String keywords = information.getKeywords(); + String creator = information.getCreator(); String producer = information.getProducer(); - - // NOTE: The XMP metadata MUST match up with the document information dictionary - // to be a valid PDF/A document. - - PDFAIdentificationSchema pdfaid = metadata.createAndAddPFAIdentificationSchema(); - pdfaid.setConformance(conformance); - pdfaid.setPart(part); + Calendar creationDate = information.getCreationDate(); + Calendar modDate = information.getModificationDate(); - AdobePDFSchema pdfSchema = metadata.createAndAddAdobePDFSchema(); - if (keywords != null) { - pdfSchema.setKeywords(keywords); - } - if (producer != null) { - pdfSchema.setProducer(producer); + if (isPdfUa && (title == null || title.isEmpty())) { + XRLog.log(Level.WARNING, LogMessageId.LogMessageId0Param.GENERAL_PDF_ACCESSIBILITY_NO_DOCUMENT_TITLE_PROVIDED); } - - XMPBasicSchema xmpBasicSchema = metadata.createAndAddXMPBasicSchema(); - xmpBasicSchema.setCreateDate(information.getCreationDate()); - - DublinCoreSchema dc = metadata.createAndAddDublinCoreSchema(); - if (author != null) { - dc.addCreator(author); + + if (pdfAConformance != PdfAConformance.NONE) { + PDFAIdentificationSchema pdfaid = metadata.createAndAddPFAIdentificationSchema(); + pdfaid.setConformance(pdfAConformance.getConformanceValue()); + pdfaid.setPart(pdfAConformance.getPart()); + + AdobePDFSchema pdfSchema = metadata.createAndAddAdobePDFSchema(); + pdfSchema.setPDFVersion(String.valueOf(pdfAConformance.getPdfVersion())); + if (keywords != null) { + pdfSchema.setKeywords(keywords); + } + if (producer != null) { + pdfSchema.setProducer(producer); + } + + XMPBasicSchema xmpBasicSchema = metadata.createAndAddXMPBasicSchema(); + if (creator != null) { + xmpBasicSchema.setCreatorTool(creator); + } + if (creationDate != null) { + xmpBasicSchema.setCreateDate(creationDate); + } + if (modDate != null) { + xmpBasicSchema.setModifyDate(modDate); + } + + + DublinCoreSchema dc = metadata.createAndAddDublinCoreSchema(); + dc.setFormat("application/pdf"); + if (author != null) { + dc.addCreator(author); + } + if (title != null) { + dc.setTitle(title); + } + if (subject != null) { + dc.setDescription(subject); + } else if (isPdfUa) { + XRLog.log(Level.WARNING, LogMessageId.LogMessageId0Param.GENERAL_PDF_ACCESSIBILITY_NO_DOCUMENT_DESCRIPTION_PROVIDED); + } } - if (title != null) { - dc.setTitle(title); + + PDFAExtensionSchema pdfAExt = metadata.createAndAddPDFAExtensionSchemaWithDefaultNS(); + pdfAExt.addNamespace("http://www.aiim.org/pdfa/ns/extension/", "pdfaExtension"); + pdfAExt.addNamespace("http://www.aiim.org/pdfa/ns/schema#", "pdfaSchema"); + pdfAExt.addNamespace("http://www.aiim.org/pdfa/ns/property#", "pdfaProperty"); + + if (pdfAConformance != PdfAConformance.NONE) { + // Description of Adobe PDF Schema + List pdfProperties = new ArrayList<>(3); + pdfProperties.add( + createPdfaProperty("internal", "The PDF file version.", "PDFVersion", "Text")); + pdfProperties.add( + createPdfaProperty("external", "Keywords.", "Keywords", "Text")); + pdfProperties.add( + createPdfaProperty("internal", "The name of the tool that created the PDF document.", "Producer", "AgentName")); + pdfAExt.addBagValue("schemas", + createPdfaSchema("Adobe PDF Schema", "http://ns.adobe.com/pdf/1.3/", "pdf", pdfProperties)); + + // Description of PDF/A ID Schema + List pdfaidProperties = new ArrayList<>(2); + pdfaidProperties.add( + createPdfaProperty("internal", "Part of PDF/A standard", "part", "Integer")); + pdfaidProperties.add( + createPdfaProperty("internal", "Conformance level of PDF/A standard", "conformance", "Text")); + pdfAExt.addBagValue("schemas", + createPdfaSchema("PDF/A ID Schema", "http://www.aiim.org/pdfa/ns/id/", "pdfaid", pdfaidProperties)); } - if (subject != null) { - dc.setDescription(subject); + if (isPdfUa) { + // Description of PDF/UA + List pdfUaProperties = new ArrayList<>(1); + pdfUaProperties.add( + createPdfaProperty("internal", "Indicates, which part of ISO 14289 standard is followed", "part", "Integer")); + XMPSchema pdfUa = createPdfaSchema("PDF/UA Universal Accessibility Schema", "http://www.aiim.org/pdfua/ns/id/", "pdfuaid" , pdfUaProperties); + pdfAExt.addBagValue("schemas", pdfUa); + pdfAExt.addNamespace("http://www.aiim.org/pdfua/ns/id/", "pdfuaid"); + pdfAExt.setTextPropertyValue("pdfuaid:part", "1"); } - + PDMetadata metadataStream = new PDMetadata(document); PDMarkInfo markInfo = new PDMarkInfo(); markInfo.setMarked(true); @@ -830,7 +824,11 @@ private void addPdfASchema(PDDocument document, int part, String conformance) { XmpSerializer serializer = new XmpSerializer(); ByteArrayOutputStream baos = new ByteArrayOutputStream(); serializer.serialize(metadata, baos, true); - metadataStream.importXMPMetadata( baos.toByteArray() ); + String xmp = baos.toString("UTF-8"); + // Fixes for bad XML generation + xmp = xmp.replace("lang=", "xml:lang="); + xmp = xmp.replace("pdfaExtension:pdfuaid:part", "pdfuaid:part"); + metadataStream.importXMPMetadata(xmp.getBytes(StandardCharsets.UTF_8)); if (_colorProfile != null) { ByteArrayInputStream colorProfile = new ByteArrayInputStream(_colorProfile); @@ -846,6 +844,30 @@ private void addPdfASchema(PDDocument document, int part, String conformance) { } } + // Creates an XML Schema to be used in the PDFA Extension + private XMPSchema createPdfaSchema(String schema, String namespace, String prefix, List properties) { + XMPSchema xmpSchema = new XMPSchema(XMPMetadata.createXMPMetadata(), + "pdfaSchema", "pdfaSchema", "pdfaSchema"); + xmpSchema.setTextPropertyValue("schema", schema); + xmpSchema.setTextPropertyValue("namespaceURI", namespace); + xmpSchema.setTextPropertyValue("prefix", prefix); + for (XMPSchema property : properties) { + xmpSchema.addUnqualifiedSequenceValue("property", property); + } + return xmpSchema; + } + + // Creates an XML Property to be used in the PDFA Extension + private XMPSchema createPdfaProperty(String category, String description, String name, String valueType) { + XMPSchema xmpSchema = new XMPSchema(XMPMetadata.createXMPMetadata(), + "pdfaProperty", "pdfaProperty", "pdfaProperty"); + xmpSchema.setTextPropertyValue("name", name); + xmpSchema.setTextPropertyValue("valueType", valueType); + xmpSchema.setTextPropertyValue("category", category); + xmpSchema.setTextPropertyValue("description", description); + return xmpSchema; + } + // Sets the document information dictionary values from html metadata private void setDidValues(PDDocument doc) { PDDocumentInformation info = new PDDocumentInformation(); diff --git a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfRendererBuilder.java b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfRendererBuilder.java index 7b69fca75..78a7b4d7e 100644 --- a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfRendererBuilder.java +++ b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfRendererBuilder.java @@ -173,6 +173,7 @@ public PdfRendererBuilder usePdfVersion(float version) { */ public PdfRendererBuilder usePdfAConformance(PdfAConformance pdfAConformance) { this.state._pdfAConformance = pdfAConformance; + this.state._pdfVersion = pdfAConformance.getPdfVersion(); return this; } @@ -301,18 +302,20 @@ public PdfRendererBuilder useSlowMode() { * PDF/A-1, PDF/A-2 and PDF/A-3 */ public enum PdfAConformance { - NONE(-1, ""), - PDFA_1_A(1, "A"), PDFA_1_B(1, "B"), - PDFA_2_A(2, "A"), PDFA_2_B(2, "B"), PDFA_2_U(2, "U"), - PDFA_3_A(3, "A"), PDFA_3_B(3, "B"), PDFA_3_U(3, "U"); + NONE(-1, "", 0f), + PDFA_1_A(1, "A", 1.4f), PDFA_1_B(1, "B", 1.4f), + PDFA_2_A(2, "A", 1.7f), PDFA_2_B(2, "B", 1.7f), PDFA_2_U(2, "U", 1.7f), + PDFA_3_A(3, "A", 1.7f), PDFA_3_B(3, "B", 1.7f), PDFA_3_U(3, "U", 1.7f); - PdfAConformance(int part, String value) { + PdfAConformance(int part, String value, float pdfVersion) { this.part = part; this.value = value; + this.pdfVersion = pdfVersion; } private final int part; private final String value; + private final float pdfVersion; public String getConformanceValue() { return this.value; @@ -321,6 +324,10 @@ public String getConformanceValue() { public int getPart() { return this.part; } + + public float getPdfVersion() { + return this.pdfVersion; + } } } From 19e9ec769ebebd7fb6039d89462f78cda51febd0 Mon Sep 17 00:00:00 2001 From: Quentin Ligier Date: Thu, 4 Mar 2021 22:32:07 +0100 Subject: [PATCH 2/2] Fix PDF/UA property prefix and improve the bad XML generation fix --- .../java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java index 33e587462..9e91e2c36 100644 --- a/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java +++ b/openhtmltopdf-pdfbox/src/main/java/com/openhtmltopdf/pdfboxout/PdfBoxRenderer.java @@ -804,7 +804,8 @@ private void addPdfASchema(PDDocument document, PdfAConformance pdfAConformance, XMPSchema pdfUa = createPdfaSchema("PDF/UA Universal Accessibility Schema", "http://www.aiim.org/pdfua/ns/id/", "pdfuaid" , pdfUaProperties); pdfAExt.addBagValue("schemas", pdfUa); pdfAExt.addNamespace("http://www.aiim.org/pdfua/ns/id/", "pdfuaid"); - pdfAExt.setTextPropertyValue("pdfuaid:part", "1"); + pdfAExt.setPrefix("pdfuaid"); + pdfAExt.setTextPropertyValue("part", "1"); } PDMetadata metadataStream = new PDMetadata(document); @@ -825,9 +826,8 @@ private void addPdfASchema(PDDocument document, PdfAConformance pdfAConformance, ByteArrayOutputStream baos = new ByteArrayOutputStream(); serializer.serialize(metadata, baos, true); String xmp = baos.toString("UTF-8"); - // Fixes for bad XML generation - xmp = xmp.replace("lang=", "xml:lang="); - xmp = xmp.replace("pdfaExtension:pdfuaid:part", "pdfuaid:part"); + // Fix for bad XML generation by some transformers + xmp = xmp.replace(" lang=\"x-default\"", " xml:lang=\"x-default\""); metadataStream.importXMPMetadata(xmp.getBytes(StandardCharsets.UTF_8)); if (_colorProfile != null) {