diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 119701c04..d8859cf1d 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -538,3 +538,4 @@ !bug1782186.pdf !tracemonkey_a11y.pdf !bug1782564.pdf +!issue15340.pdf diff --git a/test/pdfs/issue15340.pdf b/test/pdfs/issue15340.pdf new file mode 100644 index 000000000..439f1602f Binary files /dev/null and b/test/pdfs/issue15340.pdf differ diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 86e6281ac..f50ee7840 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -626,4 +626,25 @@ describe("pdf_find_controller", function () { pageMatchesLength: [[8]], }); }); + + it("performs a search in a text containing an ideographic at the end of a line", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "issue15340.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "検知機構", + }, + matchesPerPage: [1], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[29]], + pageMatchesLength: [[4]], + }); + }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index cfadec222..a0218035b 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -126,7 +126,7 @@ function normalize(text) { } else { // Compile the regular expression for text normalization once. const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); - const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`; + const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; if (syllablePositions.length === 0) { // Most of the syllables belong to Hangul so there are no need @@ -188,7 +188,7 @@ function normalize(text) { normalized = normalized.replace( normalizationRegex, - (match, p1, p2, p3, p4, p5, i) => { + (match, p1, p2, p3, p4, p5, p6, i) => { i -= shiftOrigin; if (p1) { // Maybe fractions or quotations mark... @@ -248,6 +248,15 @@ function normalize(text) { } if (p4) { + // An ideographic at the end of a line doesn't imply adding an extra + // white space. + positions.push([i - shift + 1, shift]); + shiftOrigin += 1; + eol += 1; + return p4.charAt(0); + } + + if (p5) { // eol is replaced by space: "foo\nbar" is likely equivalent to // "foo bar". positions.push([i - shift + 1, shift - 1]); @@ -257,7 +266,7 @@ function normalize(text) { return " "; } - // p5 + // p6 if (i + eol === syllablePositions[syllableIndex]?.[1]) { // A syllable (1 char) is replaced with several chars (n) so // newCharsLen = n - 1. @@ -269,7 +278,7 @@ function normalize(text) { shift -= newCharLen; shiftOrigin += newCharLen; } - return p5; + return p6; } );