Normalize fullwidth, halfwidth and circled chars when searching

2022-11-14 18:14:08 +01:00 · 2022-11-14 18:14:08 +01:00 · 2be64d63e1
commit 2be64d63e1
parent bfe6ff5893
4 changed files with 60 additions and 15 deletions
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -554,3 +554,4 @@
 !bug1796741.pdf
 !textfields.pdf
 !freetext_no_appearance.pdf
+!issue15690.pdf
--- a/test/pdfs/issue15690.pdf
+++ b/test/pdfs/issue15690.pdf
--- a/test/unit/pdf_find_controller_spec.js
+++ b/test/unit/pdf_find_controller_spec.js
@@ -647,4 +647,25 @@ describe("pdf_find_controller", function () {
      pageMatchesLength: [[4]],
    });
  });
+
+  it("performs a search in a text containing fullwidth chars", async function () {
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "issue15690.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "o",
+      },
+      matchesPerPage: [13],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[0, 10, 13, 30, 39, 41, 55, 60, 66, 84, 102, 117, 134]],
+      pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
+    });
+  });
 });
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@@ -95,6 +95,8 @@ const SYLLABLES_LENGTHS = new Map();
 const FIRST_CHAR_SYLLABLES_REG_EXP =
  "[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";

+const NFKC_CHARS_TO_NORMALIZE = new Map();
+
 let noSyllablesRegExp = null;
 let withSyllablesRegExp = null;

@@ -126,7 +128,13 @@ function normalize(text) {
  } else {
    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
-    const regexp = `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
+    const toNormalizeWithNFKC =
+      "\u2460-\u2473" + // Circled numbers.
+      "\u24b6-\u24ff" + // Circled letters/numbers.
+      "\u3244-\u32bf" + // Circled ideograms/numbers.
+      "\u32d0-\u32fe" + // Circled ideograms.
+      "\uff00-\uffef"; // Halfwidth, fullwidth forms.
+    const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;

    if (syllablePositions.length === 0) {
      // Most of the syllables belong to Hangul so there are no need
@@ -188,11 +196,11 @@ function normalize(text) {

  normalized = normalized.replace(
    normalizationRegex,
-    (match, p1, p2, p3, p4, p5, p6, i) => {
+    (match, p1, p2, p3, p4, p5, p6, p7, i) => {
      i -= shiftOrigin;
      if (p1) {
        // Maybe fractions or quotations mark...
-        const replacement = CHARACTERS_TO_NORMALIZE[match];
+        const replacement = CHARACTERS_TO_NORMALIZE[p1];
        const jj = replacement.length;
        for (let j = 1; j < jj; j++) {
          positions.push([i - shift + j, shift - j]);
@@ -202,8 +210,23 @@ function normalize(text) {
      }

      if (p2) {
-        const hasTrailingDashEOL = p2.endsWith("\n");
-        const len = hasTrailingDashEOL ? p2.length - 2 : p2.length;
+        // Use the NFKC representation to normalize the char.
+        let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
+        if (!replacement) {
+          replacement = p2.normalize("NFKC");
+          NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
+        }
+        const jj = replacement.length;
+        for (let j = 1; j < jj; j++) {
+          positions.push([i - shift + j, shift - j]);
+        }
+        shift -= jj - 1;
+        return replacement;
+      }
+
+      if (p3) {
+        const hasTrailingDashEOL = p3.endsWith("\n");
+        const len = hasTrailingDashEOL ? p3.length - 2 : p3.length;

        // Diacritics.
        hasDiacritics = true;
@@ -223,19 +246,19 @@ function normalize(text) {

        if (hasTrailingDashEOL) {
          // Diacritics are followed by a -\n.
-          // See comments in `if (p3)` block.
+          // See comments in `if (p4)` block.
          i += len - 1;
          positions.push([i - shift + 1, 1 + shift]);
          shift += 1;
          shiftOrigin += 1;
          eol += 1;
-          return p2.slice(0, len);
+          return p3.slice(0, len);
        }

-        return p2;
+        return p3;
      }

-      if (p3) {
+      if (p4) {
        // "X-\n" is removed because an hyphen at the end of a line
        // with not a space before is likely here to mark a break
        // in a word.
@@ -244,19 +267,19 @@ function normalize(text) {
        shift += 1;
        shiftOrigin += 1;
        eol += 1;
-        return p3.charAt(0);
+        return p4.charAt(0);
      }

-      if (p4) {
+      if (p5) {
        // An ideographic at the end of a line doesn't imply adding an extra
        // white space.
        positions.push([i - shift + 1, shift]);
        shiftOrigin += 1;
        eol += 1;
-        return p4.charAt(0);
+        return p5.charAt(0);
      }

-      if (p5) {
+      if (p6) {
        // eol is replaced by space: "foo\nbar" is likely equivalent to
        // "foo bar".
        positions.push([i - shift + 1, shift - 1]);
@@ -266,7 +289,7 @@ function normalize(text) {
        return " ";
      }

-      // p6
+      // p7
      if (i + eol === syllablePositions[syllableIndex]?.[1]) {
        // A syllable (1 char) is replaced with several chars (n) so
        // newCharsLen = n - 1.
@@ -278,7 +301,7 @@ function normalize(text) {
        shift -= newCharLen;
        shiftOrigin += newCharLen;
      }
-      return p6;
+      return p7;
    }
  );