diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 86847a119..1c6e6b325 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -576,3 +576,4 @@ !bug1770750.pdf !issue16063.pdf !issue16067.pdf +!bug1820909.1.pdf diff --git a/test/pdfs/bug1820909.1.pdf b/test/pdfs/bug1820909.1.pdf new file mode 100755 index 000000000..29774cb80 Binary files /dev/null and b/test/pdfs/bug1820909.1.pdf differ diff --git a/test/pdfs/bug1820909.pdf.link b/test/pdfs/bug1820909.pdf.link new file mode 100644 index 000000000..dcb184dd3 --- /dev/null +++ b/test/pdfs/bug1820909.pdf.link @@ -0,0 +1,2 @@ +https://web.archive.org/web/20221122204959/https://www.unicode.org/charts/PDF/U31350.pdf + diff --git a/test/test_manifest.json b/test/test_manifest.json index 7daa6caeb..f3ffc1ecc 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -7455,5 +7455,12 @@ "rounds": 1, "link": true, "type": "eq" + }, + { + "id": "bug1820909", + "file": "pdfs/bug1820909.pdf", + "md5": "d95a83a868671a03cbf322f16b2e2b9d", + "link": true, + "type": "other" } ] diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 215cfbd7b..4db369abf 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -854,4 +854,50 @@ describe("pdf_find_controller", function () { pageMatchesLength: [[7]], }); }); + + it("performs a search in a text with some UTF-32 chars", async function () { + if (isNodeJS) { + pending("Linked test-cases are not supported in Node.js."); + } + + const { eventBus, pdfFindController } = await initPdfFindController( + "bug1820909.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "31350", + }, + matchesPerPage: [1, 2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[41], [131, 1359]], + pageMatchesLength: [[5], [5, 5]], + }); + }); + + it("performs a search in a text with some UTF-32 chars followed by a dash at the end of a line", async 
function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "bug1820909.1.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "abcde", + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[42, 95]], + pageMatchesLength: [[5, 5]], + }); + }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index f4617e6f9..ecc52295f 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -289,21 +289,26 @@ function normalize(text) { // "X-\n" is removed because an hyphen at the end of a line // with not a space before is likely here to mark a break // in a word. - // The \n isn't in the original text so here y = i, n = 1 and o = 2. - positions.push([i - shift + 1, 1 + shift]); + // If X is a non-BMP char (surrogate pair) its length is greater than 1. + // The \n isn't in the original text so here y = i, n = X.len - 2 and + // o = X.len - 1. + const len = p5.length - 2; + positions.push([i - shift + len, 1 + shift]); shift += 1; shiftOrigin += 1; eol += 1; - return p5.charAt(0); + return p5.slice(0, -2); } if (p6) { // An ideographic at the end of a line doesn't imply adding an extra // white space. - positions.push([i - shift + 1, shift]); + // A CJK char can be a surrogate pair, hence its length isn't always 1. + const len = p6.length - 1; + positions.push([i - shift + len, shift]); shiftOrigin += 1; eol += 1; - return p6.charAt(0); + return p6.slice(0, -1); } if (p7) {