diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 17b77b87e..5ea4bb2de 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -574,3 +574,4 @@ !bug1815476.pdf !issue16021.pdf !bug1770750.pdf +!issue16063.pdf diff --git a/test/pdfs/issue16063.pdf b/test/pdfs/issue16063.pdf new file mode 100755 index 000000000..36ddfd9e1 Binary files /dev/null and b/test/pdfs/issue16063.pdf differ diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 22c83defb..215cfbd7b 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -818,4 +818,40 @@ describe("pdf_find_controller", function () { }, }); }); + + it("performs a search in a text with some Hiragana diacritics at the end of a line", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "issue16063.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "行うことができる速結端子", + }, + matchesPerPage: [1], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[63]], + pageMatchesLength: [[12]], + }); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "デュプレックス", + }, + matchesPerPage: [1], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[205]], + pageMatchesLength: [[7]], + }); + }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 3af01326c..f4617e6f9 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -136,7 +136,8 @@ function normalize(text) { // 3040-309F: Hiragana // 30A0-30FF: Katakana const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])"; - const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; + const HKDiacritics = "(?:\u3099|\u309A)"; + const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`; if (syllablePositions.length === 0) { // Most of the syllables belong to Hangul so there are no need @@ -198,7 +199,7 @@ function normalize(text) { normalized = normalized.replace( normalizationRegex, - (match, p1, p2, p3, p4, p5, p6, p7, i) => { + (match, p1, p2, p3, p4, p5, p6, p7, p8, i) => { i -= shiftOrigin; if (p1) { // Maybe fractions or quotations mark... @@ -227,8 +228,32 @@ function normalize(text) { } if (p3) { - const hasTrailingDashEOL = p3.endsWith("\n"); - const len = hasTrailingDashEOL ? p3.length - 2 : p3.length; + // We've a Katakana-Hiragana diacritic followed by a \n so don't replace + // the \n by a whitespace. + hasDiacritics = true; + + // Diacritic. + if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) { + ++rawDiacriticsIndex; + } else { + // i is the position of the first diacritic + // so (i - 1) is the position for the letter before. + positions.push([i - 1 - shift + 1, shift - 1]); + shift -= 1; + shiftOrigin += 1; + } + + // End-of-line. + positions.push([i - shift + 1, shift]); + shiftOrigin += 1; + eol += 1; + + return p3.charAt(0); + } + + if (p4) { + const hasTrailingDashEOL = p4.endsWith("\n"); + const len = hasTrailingDashEOL ? p4.length - 2 : p4.length; // Diacritics. hasDiacritics = true; @@ -248,19 +273,19 @@ function normalize(text) { if (hasTrailingDashEOL) { // Diacritics are followed by a -\n. - // See comments in `if (p4)` block. + // See comments in `if (p5)` block. i += len - 1; positions.push([i - shift + 1, 1 + shift]); shift += 1; shiftOrigin += 1; eol += 1; - return p3.slice(0, len); + return p4.slice(0, len); } - return p3; + return p4; } - if (p4) { + if (p5) { // "X-\n" is removed because an hyphen at the end of a line // with not a space before is likely here to mark a break // in a word. @@ -269,19 +294,19 @@ function normalize(text) { shift += 1; shiftOrigin += 1; eol += 1; - return p4.charAt(0); + return p5.charAt(0); } - if (p5) { + if (p6) { // An ideographic at the end of a line doesn't imply adding an extra // white space. positions.push([i - shift + 1, shift]); shiftOrigin += 1; eol += 1; - return p5.charAt(0); + return p6.charAt(0); } - if (p6) { + if (p7) { // eol is replaced by space: "foo\nbar" is likely equivalent to // "foo bar". positions.push([i - shift + 1, shift - 1]); @@ -291,7 +316,7 @@ function normalize(text) { return " "; } - // p7 + // p8 if (i + eol === syllablePositions[syllableIndex]?.[1]) { // A syllable (1 char) is replaced with several chars (n) so // newCharsLen = n - 1. @@ -303,7 +328,7 @@ function normalize(text) { shift -= newCharLen; shiftOrigin += newCharLen; } - return p7; + return p8; } );