[Search] Some matches were incorrectly shifted because of '-\n' (a hyphen at the end of a line)

- It aims to fix #14562.
- 'X-\n' sequences were not correctly positioned.
- When X is a diacritic (e.g. in "sä-\n", which is decomposed into "sa¨-\n"), we must handle both things:
  - the diacritics on the one hand;
  - the "-\n" on the other hand.
2022-02-13 16:55:56 +01:00 · 2022-02-13 16:55:56 +01:00 · 18f4e560ae
commit 18f4e560ae
parent 263c89581f
4 changed files with 100 additions and 31 deletions
--- a/test/pdfs/issue14562.pdf.link
+++ b/test/pdfs/issue14562.pdf.link
@ -0,0 +1 @@
+https://github.com/mozilla/pdf.js/files/8055863/NHA-20211231-A-018_before.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -6271,5 +6271,12 @@
         "value": "PDF.js PDF.js PDF.js"
        }
      }
+   },
+   { "id": "issue14562",
+      "file": "pdfs/issue14562.pdf",
+      "md5": "565b0a1e46a32e907837506a10d25277",
+      "rounds": 1,
+      "link": true,
+      "type": "other"
   }
 ]
--- a/test/unit/pdf_find_controller_spec.js
+++ b/test/unit/pdf_find_controller_spec.js
@ -16,6 +16,7 @@
 import { buildGetDocumentParams } from "./test_utils.js";
 import { EventBus } from "../../web/event_utils.js";
 import { getDocument } from "../../src/display/api.js";
+import { isNodeJS } from "../../src/shared/is_node.js";
 import { PDFFindController } from "../../web/pdf_find_controller.js";
 import { SimpleLinkService } from "../../web/pdf_link_service.js";

@ -554,4 +555,48 @@ describe("pdf_find_controller", function () {
      pageMatchesLength: [[23]],
    });
  });
+
+  it("performs a search in a text containing diacritics before -\\n", async function () {
+    if (isNodeJS) {
+      pending("Linked test-cases are not supported in Node.js.");
+    }
+
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "issue14562.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "ä",
+        matchDiacritics: true,
+      },
+      matchesPerPage: [80],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [
+        [
+          299, 337, 414, 476, 623, 797, 978, 984, 1010, 1058, 1079, 1144, 1152,
+          1274, 1343, 1391, 1399, 1421, 1497, 1521, 1527, 1684, 1774, 1786,
+          1857, 1879, 1909, 1946, 2064, 2074, 2161, 2178, 2213, 2227, 2272,
+          2322, 2359, 2401, 2412, 2423, 2462, 2532, 2538, 2553, 2562, 2576,
+          2602, 2613, 2638, 2668, 2792, 2805, 2836, 2848, 2859, 2896, 2902,
+          2916, 2940, 2960, 3091, 3239, 3249, 3339, 3387, 3394, 3468, 3477,
+          3485, 3502, 3690, 3696, 3711, 3758, 3789, 3865, 3977, 4052, 4058,
+          4071,
+        ],
+      ],
+      pageMatchesLength: [
+        [
+          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        ],
+      ],
+    });
+  });
 });
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -96,7 +96,7 @@ function normalize(text) {
    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
    normalizationRegex = new RegExp(
-      `([${replace}])|(\\S-\\n)|(\\n)|(\\p{M}+)`,
+      `([${replace}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\n)`,
      "gum"
    );
  }
@ -159,43 +159,59 @@ function normalize(text) {
      }

      if (p2) {
-        // "X-\n" is removed because an hyphen at the end of a line
-        // with not a space before is likely here to mark a break
-        // in a word.
-        positions.push([i - shift, 1 + shift]);
-        shift += 1;
-        shiftOrigin += 1;
-        eol += 1;
-        return p2.charAt(0);
+        const hasTrailingDashEOL = p2.endsWith("\n");
+        const len = hasTrailingDashEOL ? p2.length - 2 : p2.length;
+
+        // Diacritics.
+        hasDiacritics = true;
+        let jj = len;
+        if (i + eol === rawDiacriticsPositions[k]?.[1]) {
+          jj -= rawDiacriticsPositions[k][0];
+          ++k;
+        }
+
+        for (let j = 1; j < jj + 1; j++) {
+          // i is the position of the first diacritic
+          // so (i - 1) is the position for the letter before.
+          positions.push([i - 1 - shift + j, shift - j]);
+        }
+        shift -= jj;
+        shiftOrigin += jj;
+
+        if (hasTrailingDashEOL) {
+          // Diacritics are followed by a -\n.
+          // See comments in `if (p3)` block.
+          i += len - 1;
+          positions.push([i - shift + 1, 1 + shift]);
+          shift += 1;
+          shiftOrigin += 1;
+          eol += 1;
+          return p2.slice(0, len);
+        }
+
+        return p2;
      }

      if (p3) {
-        // eol is replaced by space: "foo\nbar" is likely equivalent to
-        // "foo bar".
-        positions.push([i - shift + 1, shift - 1]);
-        shift -= 1;
+        // "X-\n" is removed because an hyphen at the end of a line
+        // with not a space before is likely here to mark a break
+        // in a word.
+        // The \n isn't in the original text so here y = i, n = 1 and o = 2.
+        positions.push([i - shift + 1, 1 + shift]);
+        shift += 1;
        shiftOrigin += 1;
        eol += 1;
-        return " ";
+        return p3.charAt(0);
      }

-      // Diacritics.
-      hasDiacritics = true;
-      let jj = p4.length;
-      if (i + eol === rawDiacriticsPositions[k]?.[1]) {
-        jj -= rawDiacriticsPositions[k][0];
-        ++k;
-      }
-
-      for (let j = 1; j < jj + 1; j++) {
-        // i is the position of the first diacritic
-        // so (i - 1) is the position for the letter before.
-        positions.push([i - 1 - shift + j, shift - j]);
-      }
-      shift -= jj;
-      shiftOrigin += jj;
-
-      return p4;
+      // p4
+      // eol is replaced by space: "foo\nbar" is likely equivalent to
+      // "foo bar".
+      positions.push([i - shift + 1, shift - 1]);
+      shift -= 1;
+      shiftOrigin += 1;
+      eol += 1;
+      return " ";
    }
  );
				`@ -0,0 +1 @@`
				`https://github.com/mozilla/pdf.js/files/8055863/NHA-20211231-A-018_before.pdf`