diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index bbd1f456c..2a1db6fe9 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -554,3 +554,4 @@ !bug1796741.pdf !textfields.pdf !freetext_no_appearance.pdf +!issue15690.pdf diff --git a/test/pdfs/issue15690.pdf b/test/pdfs/issue15690.pdf new file mode 100755 index 000000000..fdc09c6ce Binary files /dev/null and b/test/pdfs/issue15690.pdf differ diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index f50ee7840..235e2a6eb 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -647,4 +647,25 @@ describe("pdf_find_controller", function () { pageMatchesLength: [[4]], }); }); + + it("performs a search in a text containing fullwidth chars", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "issue15690.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "o", + }, + matchesPerPage: [13], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[0, 10, 13, 30, 39, 41, 55, 60, 66, 84, 102, 117, 134]], + pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], + }); + }); }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index a0218035b..a18ff2d2e 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -95,6 +95,8 @@ const SYLLABLES_LENGTHS = new Map(); const FIRST_CHAR_SYLLABLES_REG_EXP = "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]"; +const NFKC_CHARS_TO_NORMALIZE = new Map(); + let noSyllablesRegExp = null; let withSyllablesRegExp = null; @@ -126,7 +128,13 @@ function normalize(text) { } else { // Compile the regular expression for text normalization once. const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); - const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; + const toNormalizeWithNFKC = + "\u2460-\u2473" + // Circled numbers. + "\u24b6-\u24ff" + // Circled letters/numbers. + "\u3244-\u32bf" + // Circled ideograms/numbers. + "\u32d0-\u32fe" + // Circled ideograms. + "\uff00-\uffef"; // Halfwidth, fullwidth forms. + const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`; if (syllablePositions.length === 0) { // Most of the syllables belong to Hangul so there are no need @@ -188,11 +196,11 @@ function normalize(text) { normalized = normalized.replace( normalizationRegex, - (match, p1, p2, p3, p4, p5, p6, i) => { + (match, p1, p2, p3, p4, p5, p6, p7, i) => { i -= shiftOrigin; if (p1) { // Maybe fractions or quotations mark... - const replacement = CHARACTERS_TO_NORMALIZE[match]; + const replacement = CHARACTERS_TO_NORMALIZE[p1]; const jj = replacement.length; for (let j = 1; j < jj; j++) { positions.push([i - shift + j, shift - j]); @@ -202,8 +210,23 @@ function normalize(text) { } if (p2) { - const hasTrailingDashEOL = p2.endsWith("\n"); - const len = hasTrailingDashEOL ? p2.length - 2 : p2.length; + // Use the NFKC representation to normalize the char. + let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2); + if (!replacement) { + replacement = p2.normalize("NFKC"); + NFKC_CHARS_TO_NORMALIZE.set(p2, replacement); + } + const jj = replacement.length; + for (let j = 1; j < jj; j++) { + positions.push([i - shift + j, shift - j]); + } + shift -= jj - 1; + return replacement; + } + + if (p3) { + const hasTrailingDashEOL = p3.endsWith("\n"); + const len = hasTrailingDashEOL ? p3.length - 2 : p3.length; // Diacritics. hasDiacritics = true; @@ -223,19 +246,19 @@ function normalize(text) { if (hasTrailingDashEOL) { // Diacritics are followed by a -\n. - // See comments in `if (p3)` block. + // See comments in `if (p4)` block. i += len - 1; positions.push([i - shift + 1, 1 + shift]); shift += 1; shiftOrigin += 1; eol += 1; - return p2.slice(0, len); + return p3.slice(0, len); } - return p2; + return p3; } - if (p3) { + if (p4) { // "X-\n" is removed because an hyphen at the end of a line // with not a space before is likely here to mark a break // in a word. @@ -244,19 +267,19 @@ function normalize(text) { shift += 1; shiftOrigin += 1; eol += 1; - return p3.charAt(0); + return p4.charAt(0); } - if (p4) { + if (p5) { // An ideographic at the end of a line doesn't imply adding an extra // white space. positions.push([i - shift + 1, shift]); shiftOrigin += 1; eol += 1; - return p4.charAt(0); + return p5.charAt(0); } - if (p5) { + if (p6) { // eol is replaced by space: "foo\nbar" is likely equivalent to // "foo bar". positions.push([i - shift + 1, shift - 1]); @@ -266,7 +289,7 @@ function normalize(text) { return " "; } - // p6 + // p7 if (i + eol === syllablePositions[syllableIndex]?.[1]) { // A syllable (1 char) is replaced with several chars (n) so // newCharsLen = n - 1. @@ -278,7 +301,7 @@ function normalize(text) { shift -= newCharLen; shiftOrigin += newCharLen; } - return p6; + return p7; } );