diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 3e836ce16ca4a..e5b96e6f36c41 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -2317,9 +2317,20 @@ class PartialEvaluator {
               return;
             }
             // Other marked content types aren't supported yet.
+            let props = null;
+            if (args[1] instanceof Dict) {
+              const lang = args[1].get("Lang");
+              if (typeof lang === "string") {
+                props = Object.create(null);
+                props.lang = stringToPDFString(lang);
+              }
+            }
+
             args = [
               args[0].name,
               args[1] instanceof Dict ? args[1].get("MCID") : null,
+              // Extra properties (currently only `lang`), or null when absent.
+              props,
             ];
 
             break;
@@ -3505,8 +3516,14 @@ class PartialEvaluator {
               markedContentData.level++;
 
               let mcid = null;
+              let itemLang = null;
               if (args[1] instanceof Dict) {
                 mcid = args[1].get("MCID");
+                // Only a string "Lang" entry is exposed on the item.
+                const langString = args[1].get("Lang");
+                if (typeof langString === "string") {
+                  itemLang = stringToPDFString(langString);
+                }
               }
               textContent.items.push({
                 type: "beginMarkedContentProps",
@@ -3514,6 +3531,7 @@ class PartialEvaluator {
                   ? `${self.idFactory.getPageObjId()}_mc${mcid}`
                   : null,
                 tag: args[0] instanceof Name ? args[0].name : null,
+                lang: itemLang,
               });
             }
             break;
diff --git a/src/display/api.js b/src/display/api.js
index 149ceb2375cd7..bb249baa55942 100644
--- a/src/display/api.js
+++ b/src/display/api.js
@@ -1166,6 +1166,8 @@ class PDFDocumentProxy {
  *   'beginMarkedContentProps', or 'endMarkedContent'.
  * @property {string} id - The marked content identifier. Only used for type
  *   'beginMarkedContentProps'.
+ * @property {string|null} tag - The marked content tag.
+ * @property {string|null} lang - The lang attribute for the marked content.
  */
 
 /**
diff --git a/src/display/text_layer.js b/src/display/text_layer.js
index 724924dc6f13a..401df90a141a1 100644
--- a/src/display/text_layer.js
+++ b/src/display/text_layer.js
@@ -293,6 +293,9 @@ class TextLayer {
           if (item.id) {
             this.#container.setAttribute("id", `${item.id}`);
           }
+          if (item.lang) {
+            this.#container.setAttribute("lang", item.lang);
+          }
           parent.append(this.#container);
         } else if (item.type === "endMarkedContent") {
           this.#container = this.#container.parentNode;
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 75b11be7dc1a2..ab63991a2a1b8 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -752,3 +752,4 @@
 !bug1937438_af_from_latex.pdf
 !bug1937438_from_word.pdf
 !bug1937438_mml_from_latex.pdf
+!marked_content_lang.pdf
diff --git a/test/pdfs/marked_content_lang.pdf b/test/pdfs/marked_content_lang.pdf
new file mode 100644
index 0000000000000..606eef10afef4
Binary files /dev/null and b/test/pdfs/marked_content_lang.pdf differ
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index a8e0fbc07e68c..b052ac7321f18 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -4488,6 +4488,25 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
       await loadingTask.destroy();
     });
 
+    it("gets operatorList, with marked content lang", async function () {
+      const loadingTask = getDocument(
+        buildGetDocumentParams("marked_content_lang.pdf")
+      );
+      const pdfDoc = await loadingTask.promise;
+      const pdfPage = await pdfDoc.getPage(1);
+      const opList = await pdfPage.getOperatorList({
+        annotationMode: AnnotationMode.DISABLE,
+      });
+      expect(opList.fnArray[0]).toEqual(OPS.beginMarkedContentProps);
+      expect(opList.argsArray[0][0]).toEqual("P");
+      expect(opList.argsArray[0][2]?.lang).toEqual("en-US");
+      expect(opList.fnArray[10]).toEqual(OPS.beginMarkedContentProps);
+      expect(opList.argsArray[10][0]).toEqual("P");
+      expect(opList.argsArray[10][2]?.lang).toEqual("es-ES");
+
+      await loadingTask.destroy();
+    });
+
     it("gets operatorList, with page resources containing corrupt /CCITTFaxDecode data", async function () {
       const loadingTask = getDocument(
         buildGetDocumentParams("poppler-90-0-fuzzed.pdf")
diff --git a/test/unit/text_layer_spec.js b/test/unit/text_layer_spec.js
index 644e742458129..9aade28528dbe 100644
--- a/test/unit/text_layer_spec.js
+++ b/test/unit/text_layer_spec.js
@@ -250,4 +250,32 @@ describe("textLayer", function () {
 
     await loadingTask.destroy();
   });
+
+  it("handles lang attribute for marked content", async function () {
+    if (isNodeJS) {
+      pending("document.createElement is not supported in Node.js.");
+    }
+    const loadingTask = getDocument(
+      buildGetDocumentParams("marked_content_lang.pdf")
+    );
+    const pdfDocument = await loadingTask.promise;
+    const page = await pdfDocument.getPage(1);
+
+    const container = document.createElement("div");
+    const textLayer = new TextLayer({
+      textContentSource: page.streamTextContent({
+        includeMarkedContent: true,
+      }),
+      container,
+      viewport: page.getViewport({ scale: 1 }),
+    });
+    await textLayer.render();
+
+    // Marked content ids are built as `<page object id>_mc<MCID>`.
+    const span = container.querySelector("#p17R_mc1");
+    expect(span.getAttribute("lang")).toEqual("es-ES");
+    expect(span.textContent).toEqual("Esto es español");
+
+    await loadingTask.destroy();
+  });
 });