Skip to content

Commit

Permalink
Merge pull request #16138 from calixteman/bug1820909
Browse files Browse the repository at this point in the history
Fix search in a pdf containing some UTF-32 characters (bug 1820909)
  • Loading branch information
calixteman authored Mar 9, 2023
2 parents a0ef5a4 + 07b0947 commit 0338df2
Show file tree
Hide file tree
Showing 6 changed files with 66 additions and 5 deletions.
1 change: 1 addition & 0 deletions test/pdfs/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -576,3 +576,4 @@
!bug1770750.pdf
!issue16063.pdf
!issue16067.pdf
!bug1820909.1.pdf
Binary file added test/pdfs/bug1820909.1.pdf
Binary file not shown.
2 changes: 2 additions & 0 deletions test/pdfs/bug1820909.pdf.link
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
https://web.archive.org/web/20221122204959/https://www.unicode.org/charts/PDF/U31350.pdf

7 changes: 7 additions & 0 deletions test/test_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -7455,5 +7455,12 @@
"rounds": 1,
"link": true,
"type": "eq"
},
{
"id": "bug1820909",
"file": "pdfs/bug1820909.pdf",
"md5": "d95a83a868671a03cbf322f16b2e2b9d",
"link": true,
"type": "other"
}
]
46 changes: 46 additions & 0 deletions test/unit/pdf_find_controller_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -854,4 +854,50 @@ describe("pdf_find_controller", function () {
pageMatchesLength: [[7]],
});
});

it("performs a search in a text with some UTF-32 chars", async function () {
  // This spec depends on a linked PDF; the Node.js run marks it as pending
  // instead of executing it.
  if (isNodeJS) {
    pending("Linked test-cases are not supported in Node.js.");
  }

  const setup = await initPdfFindController("bug1820909.pdf");

  // Search parameters handed to the find controller.
  const state = { query: "31350" };

  // Expected outcome: one match on page index 0 and two on page index 1,
  // every match having a length of 5, with the very first match selected.
  const matchesPerPage = [1, 2];
  const selectedMatch = { pageIndex: 0, matchIndex: 0 };
  const pageMatches = [[41], [131, 1359]];
  const pageMatchesLength = [[5], [5, 5]];

  await testSearch({
    eventBus: setup.eventBus,
    pdfFindController: setup.pdfFindController,
    state,
    matchesPerPage,
    selectedMatch,
    pageMatches,
    pageMatchesLength,
  });
});

it("performs a search in a text with some UTF-32 chars followed by a dash at the end of a line", async function () {
  const setup = await initPdfFindController("bug1820909.1.pdf");

  // Search parameters handed to the find controller.
  const state = { query: "abcde" };

  // Expected outcome: two matches on the single page, both with a length of
  // 5, and the first one selected.
  const matchesPerPage = [2];
  const selectedMatch = { pageIndex: 0, matchIndex: 0 };
  const pageMatches = [[42, 95]];
  const pageMatchesLength = [[5, 5]];

  await testSearch({
    eventBus: setup.eventBus,
    pdfFindController: setup.pdfFindController,
    state,
    matchesPerPage,
    selectedMatch,
    pageMatches,
    pageMatchesLength,
  });
});
});
15 changes: 10 additions & 5 deletions web/pdf_find_controller.js
Original file line number Diff line number Diff line change
Expand Up @@ -289,21 +289,26 @@ function normalize(text) {
// "X-\n" is removed because a hyphen at the end of a line
// without a space before it is likely there to mark a break
// in a word.
// The \n isn't in the original text so here y = i, n = 1 and o = 2.
positions.push([i - shift + 1, 1 + shift]);
// If X is encoded with UTF-32 then it can have a length greater than 1.
// The \n isn't in the original text so here y = i, n = X.len - 2 and
// o = X.len - 1.
const len = p5.length - 2;
positions.push([i - shift + len, 1 + shift]);
shift += 1;
shiftOrigin += 1;
eol += 1;
return p5.charAt(0);
return p5.slice(0, -2);
}

if (p6) {
// An ideographic at the end of a line doesn't imply adding an extra
// white space.
positions.push([i - shift + 1, shift]);
// A CJK character can be encoded in UTF-32, hence its length isn't always 1.
const len = p6.length - 1;
positions.push([i - shift + len, shift]);
shiftOrigin += 1;
eol += 1;
return p6.charAt(0);
return p6.slice(0, -1);
}

if (p7) {
Expand Down

0 comments on commit 0338df2

Please sign in to comment.