Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions src/core/evaluator.js
Original file line number Diff line number Diff line change
Expand Up @@ -2317,9 +2317,19 @@ class PartialEvaluator {
return;
}
// Other marked content types aren't supported yet.
let props = null;
if (args[1] instanceof Dict) {
const lang = args[1].get("Lang");
if (typeof lang === "string") {
props = Object.create(null);
props.lang = stringToPDFString(lang);
}
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As said in the test, I don't understand why it's useful.
That said, I don't really see the point of having the MCID either...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMHO, it's correct for an API called getOperatorList to return the complete list of operators and their arguments, as well as the MCID. It's then up to the rendering scripts to decide whether or not to use that data.

args = [
args[0].name,
args[1] instanceof Dict ? args[1].get("MCID") : null,
props,
];

break;
Expand Down Expand Up @@ -3505,15 +3515,21 @@ class PartialEvaluator {
markedContentData.level++;

let mcid = null;
let itemLang = null;
if (args[1] instanceof Dict) {
mcid = args[1].get("MCID");
const langString = args[1].get("Lang");
if (typeof langString === "string") {
itemLang = stringToPDFString(langString);
}
}
textContent.items.push({
type: "beginMarkedContentProps",
id: Number.isInteger(mcid)
? `${self.idFactory.getPageObjId()}_mc${mcid}`
: null,
tag: args[0] instanceof Name ? args[0].name : null,
lang: itemLang,
});
}
break;
Expand Down
2 changes: 2 additions & 0 deletions src/display/api.js
Original file line number Diff line number Diff line change
Expand Up @@ -1166,6 +1166,8 @@ class PDFDocumentProxy {
* 'beginMarkedContentProps', or 'endMarkedContent'.
* @property {string} id - The marked content identifier. Only used for type
* 'beginMarkedContentProps'.
* @property {string|null} tag - The marked content tag.
* @property {string|null} lang - The lang attribute for the marked content.
*/

/**
Expand Down
3 changes: 3 additions & 0 deletions src/display/text_layer.js
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,9 @@ class TextLayer {
if (item.id) {
this.#container.setAttribute("id", `${item.id}`);
}
if (item.lang) {
this.#container.setAttribute("lang", item.lang);
}
parent.append(this.#container);
} else if (item.type === "endMarkedContent") {
this.#container = this.#container.parentNode;
Expand Down
1 change: 1 addition & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -752,3 +752,4 @@
!bug1937438_af_from_latex.pdf
!bug1937438_from_word.pdf
!bug1937438_mml_from_latex.pdf
!marked_content_lang.pdf
Binary file added test/pdfs/marked_content_lang.pdf
Binary file not shown.
17 changes: 17 additions & 0 deletions test/unit/api_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -4488,6 +4488,23 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});

it("gets operatorList, with marked content lang", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("marked_content_lang.pdf")
);
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(1);
const opList = await pdfPage.getOperatorList({
annotationMode: AnnotationMode.DISABLE,
});
expect(opList.fnArray[0]).toEqual(OPS.beginMarkedContentProps);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is it useful to get the lang parameter in the operator list, which is used to draw the PDF?

expect(opList.argsArray[0][0]).toEqual("P");
expect(opList.argsArray[0][2]?.lang).toEqual("en-US");
expect(opList.fnArray[10]).toEqual(OPS.beginMarkedContentProps);
expect(opList.argsArray[10][0]).toEqual("P");
expect(opList.argsArray[10][2]?.lang).toEqual("es-ES");
});

it("gets operatorList, with page resources containing corrupt /CCITTFaxDecode data", async function () {
const loadingTask = getDocument(
buildGetDocumentParams("poppler-90-0-fuzzed.pdf")
Expand Down
25 changes: 25 additions & 0 deletions test/unit/text_layer_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,4 +250,29 @@ describe("textLayer", function () {

await loadingTask.destroy();
});

it("handles lang attribute for marked content", async function () {
  // This spec needs a DOM (`document.createElement`), so it is marked as
  // pending when running under Node.js.
  if (isNodeJS) {
    pending("document.createElement is not supported in Node.js.");
  }
  const loadingTask = getDocument(
    buildGetDocumentParams("marked_content_lang.pdf")
  );
  const pdfDocument = await loadingTask.promise;
  const page = await pdfDocument.getPage(1);

  const container = document.createElement("div");
  const textLayer = new TextLayer({
    // Request marked-content items so that the text layer receives the
    // `id` and `lang` values that are asserted below.
    textContentSource: page.streamTextContent({
      includeMarkedContent: true,
    }),
    container,
    viewport: page.getViewport({ scale: 1 }),
  });
  await textLayer.render();

  // Look up the marked-content wrapper by its generated id and check that
  // the language and the text were both carried over.
  const span = container.querySelector("#p17R_mc1");
  expect(span.getAttribute("lang")).toEqual("es-ES");
  expect(span.textContent).toEqual("Esto es español");

  // Release the loaded document, as the preceding spec in this file does.
  await loadingTask.destroy();
});
});