Skip to content

Commit d2a6638

Browse files
committed
Use ActualText when getting the text for the text layer
1 parent 250cc7d commit d2a6638

File tree

4 files changed

+37
-0
lines changed

4 files changed

+37
-0
lines changed

src/core/evaluator.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2406,6 +2406,7 @@ class PartialEvaluator {
24062406
transform: null,
24072407
fontName: null,
24082408
hasEOL: false,
2409+
span: "",
24092410
};
24102411

24112412
// Use a circular buffer (length === 2) to save the last chars in the
@@ -3068,6 +3069,16 @@ class PartialEvaluator {
30683069
textContent.items.push(runBidiTransform(textContentItem));
30693070
textContentItem.initialized = false;
30703071
textContentItem.str.length = 0;
3072+
textContentItem.span = "";
3073+
}
3074+
3075+
/**
 * Replace the text accumulated in the current `textContentItem` with the
 * pending `span` string, if there is one.
 *
 * In this diff, `span` is filled from the "ActualText" entry of a "Span"
 * marked-content sequence, and this function is called on
 * `OPS.endMarkedContent` just before `flushTextContentItem()`.
 *
 * Does nothing when no span text is pending or when no text has been
 * accumulated in `str`.
 */
function replaceTextContentBySpan() {
3076+
const { span, str } = textContentItem;
3077+
// Nothing to substitute: either no span text was recorded, or the item
// has no text chunks to replace.
if (!span || str.length === 0) {
3078+
return;
3079+
}
3080+
// Clear the chunk list in place (the same object stays referenced by
// `textContentItem`) and make the span text its only entry.
str.length = 0;
3081+
str.push(span);
30713082
}
30723083

30733084
function enqueueChunk(batch = false) {
@@ -3446,6 +3457,11 @@ class PartialEvaluator {
34463457
return;
34473458
case OPS.beginMarkedContent:
34483459
flushTextContentItem();
3460+
if (args[0]?.name === "Span") {
3461+
textContentItem.span = stringToPDFString(
3462+
args[1]?.get("ActualText") || ""
3463+
);
3464+
}
34493465
if (includeMarkedContent) {
34503466
markedContentData.level++;
34513467

@@ -3457,6 +3473,11 @@ class PartialEvaluator {
34573473
break;
34583474
case OPS.beginMarkedContentProps:
34593475
flushTextContentItem();
3476+
if (args[0]?.name === "Span") {
3477+
textContentItem.span = stringToPDFString(
3478+
args[1]?.get("ActualText") || ""
3479+
);
3480+
}
34603481
if (includeMarkedContent) {
34613482
markedContentData.level++;
34623483

@@ -3474,6 +3495,7 @@ class PartialEvaluator {
34743495
}
34753496
break;
34763497
case OPS.endMarkedContent:
3498+
replaceTextContentBySpan();
34773499
flushTextContentItem();
34783500
if (includeMarkedContent) {
34793501
if (markedContentData.level === 0) {

test/pdfs/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,3 +726,4 @@
726726
!chrome-text-selection-markedContent.pdf
727727
!bug1963407.pdf
728728
!issue19517.pdf
729+
!issue20007.pdf

test/pdfs/issue20007.pdf

11.3 KB
Binary file not shown.

test/unit/api_spec.js

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3923,6 +3923,20 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
39233923
expect(items[1].fontName).not.toEqual(items[0].fontName);
39243924
});
39253925

3926+
// Regression test for issue20007.pdf: the text merged from the items
// returned by `getTextContent` for page 1 must equal the expected sentence.
it("get the text a content stream containing some ActualText", async function () {
3927+
const loadingTask = getDocument(buildGetDocumentParams("issue20007.pdf"));
3928+
const pdfDoc = await loadingTask.promise;
3929+
const pdfPage = await pdfDoc.getPage(1);
3930+
3931+
// Normalization is disabled for this extraction.
const { items } = await pdfPage.getTextContent({
3932+
disableNormalization: true,
3933+
});
3934+
const text = mergeText(items);
3935+
expect(text).toEqual("The quick brown fox jumps over the lazy dog");
3936+
3937+
// Tear down the loading task once the assertion has run.
await loadingTask.destroy();
3938+
});
3939+
39263940
it("gets empty structure tree", async function () {
39273941
const tree = await page.getStructTree();
39283942

0 commit comments

Comments
 (0)