Fix search in a pdf containing some UTF-32 characters (bug 1820909)

Some chars were assumed to have a length equal to 1, but UTF-32 chars can be longer.
2023-03-09 14:13:28 +01:00 · 2023-03-09 14:13:28 +01:00 · 07b094729e
commit 07b094729e
parent a0ef5a4ae1
6 changed files with 66 additions and 5 deletions
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -576,3 +576,4 @@
 !bug1770750.pdf
 !issue16063.pdf
 !issue16067.pdf
+!bug1820909.1.pdf
--- a/test/pdfs/bug1820909.1.pdf
+++ b/test/pdfs/bug1820909.1.pdf
--- a/test/pdfs/bug1820909.pdf.link
+++ b/test/pdfs/bug1820909.pdf.link
@ -0,0 +1,2 @@
+https://web.archive.org/web/20221122204959/https://www.unicode.org/charts/PDF/U31350.pdf
+
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -7455,5 +7455,12 @@
      "rounds": 1,
      "link": true,
      "type": "eq"
+   },
+   {
+      "id": "bug1820909",
+      "file": "pdfs/bug1820909.pdf",
+      "md5": "d95a83a868671a03cbf322f16b2e2b9d",
+      "link": true,
+      "type": "other"
   }
 ]
--- a/test/unit/pdf_find_controller_spec.js
+++ b/test/unit/pdf_find_controller_spec.js
@ -854,4 +854,50 @@ describe("pdf_find_controller", function () {
      pageMatchesLength: [[7]],
    });
  });
+
+  it("performs a search in a text with some UTF-32 chars", async function () {
+    if (isNodeJS) {
+      pending("Linked test-cases are not supported in Node.js.");
+    }
+
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "bug1820909.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "31350",
+      },
+      matchesPerPage: [1, 2],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[41], [131, 1359]],
+      pageMatchesLength: [[5], [5, 5]],
+    });
+  });
+
+  it("performs a search in a text with some UTF-32 chars followed by a dash at the end of a line", async function () {
+    const { eventBus, pdfFindController } = await initPdfFindController(
+      "bug1820909.1.pdf"
+    );
+
+    await testSearch({
+      eventBus,
+      pdfFindController,
+      state: {
+        query: "abcde",
+      },
+      matchesPerPage: [2],
+      selectedMatch: {
+        pageIndex: 0,
+        matchIndex: 0,
+      },
+      pageMatches: [[42, 95]],
+      pageMatchesLength: [[5, 5]],
+    });
+  });
 });
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -289,21 +289,26 @@ function normalize(text) {
        // "X-\n" is removed because an hyphen at the end of a line
        // with not a space before is likely here to mark a break
        // in a word.
-        // The \n isn't in the original text so here y = i, n = 1 and o = 2.
-        positions.push([i - shift + 1, 1 + shift]);
+        // If X is encoded with UTF-32 then it can have a length greater than 1.
+        // The \n isn't in the original text so here y = i, n = X.len - 2 and
+        // o = X.len - 1.
+        const len = p5.length - 2;
+        positions.push([i - shift + len, 1 + shift]);
        shift += 1;
        shiftOrigin += 1;
        eol += 1;
-        return p5.charAt(0);
+        return p5.slice(0, -2);
      }

      if (p6) {
        // An ideographic at the end of a line doesn't imply adding an extra
        // white space.
-        positions.push([i - shift + 1, shift]);
+        // A CJK can be encoded in UTF-32, hence their length isn't always 1.
+        const len = p6.length - 1;
+        positions.push([i - shift + len, shift]);
        shiftOrigin += 1;
        eol += 1;
-        return p6.charAt(0);
+        return p6.slice(0, -1);
      }

      if (p7) {
				`@ -0,0 +1,2 @@`
				`https://web.archive.org/web/20221122204959/https://www.unicode.org/charts/PDF/U31350.pdf`