Don't add an extra space after a Katakana or Hiragana character at the end of a line when searching

2022-11-29 10:46:48 +01:00 · 2022-11-29 10:46:48 +01:00 · ea1995991b
commit ea1995991b
parent 44bc315444
4 changed files with 27 additions and 1 deletions
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -557,3 +557,4 @@
 !freetext_no_appearance.pdf
 !issue15690.pdf
 !bug1802888.pdf
 !issue15759.pdf
--- a/test/pdfs/issue15759.pdf
+++ b/test/pdfs/issue15759.pdf
--- a/test/unit/pdf_find_controller_spec.js
+++ b/test/unit/pdf_find_controller_spec.js
@ -668,4 +668,25 @@ describe("pdf_find_controller", function () {
      pageMatchesLength: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
    });
  });
  // Searches "issue15759.pdf" for a five-character Katakana query and checks
  // the values reported by the find controller: one match on the only page,
  // with `pageMatches` of 6 and `pageMatchesLength` of 5 (the query length).
  it("performs a search in a text with some Katakana at the end of a line", async function () {
    const controllerSetup = await initPdfFindController("issue15759.pdf");
    const { eventBus, pdfFindController } = controllerSetup;

    const state = { query: "ソレノイド" };
    const selectedMatch = { pageIndex: 0, matchIndex: 0 };

    await testSearch({
      eventBus,
      pdfFindController,
      state,
      matchesPerPage: [1],
      selectedMatch,
      pageMatches: [[6]],
      pageMatchesLength: [[5]],
    });
  });
 });
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -132,7 +132,11 @@ function normalize(text) {
      "\u3244-\u32bf" + // Circled ideograms/numbers.
      "\u32d0-\u32fe" + // Circled ideograms.
      "\uff00-\uffef"; // Halfwidth, fullwidth forms.
-    const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(\\p{Ideographic}\\n)|(\\n)`;
+
    // 3040-309F: Hiragana
    // 30A0-30FF: Katakana
    const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
    const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
    if (syllablePositions.length === 0) {
      // Most of the syllables belong to Hangul so there is no need