Merge pull request #13261 from calixteman/diacritics1

[api-minor] Support search with or without diacritics (bug 1508345, bug 916883, bug 1651113)
2022-02-03 16:30:47 +01:00 · 2022-02-03 16:30:47 +01:00 · 8281e64db3
commit 8281e64db3
parent 70073ed81c 1f41028fcb
12 changed files with 604 additions and 172 deletions
--- a/l10n/en-US/viewer.properties
+++ b/l10n/en-US/viewer.properties
@ -170,6 +170,7 @@ find_next.title=Find the next occurrence of the phrase
 find_next_label=Next
 find_highlight=Highlight all
 find_match_case_label=Match case
 find_match_diacritics_label=Match Diacritics
 find_entire_word_label=Whole words
 find_reached_top=Reached top of document, continued from bottom
 find_reached_bottom=Reached end of document, continued from top
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -357,6 +357,7 @@
 !issue4650.pdf
 !issue6721_reduced.pdf
 !issue3025.pdf
 !french_diacritics.pdf
 !issue2099-1.pdf
 !issue3371.pdf
 !issue2956.pdf
--- a/test/pdfs/french_diacritics.pdf
+++ b/test/pdfs/french_diacritics.pdf
--- a/test/unit/pdf_find_controller_spec.js
+++ b/test/unit/pdf_find_controller_spec.js
@ -86,6 +86,7 @@ function testSearch({
        entireWord: false,
        phraseSearch: true,
        findPrevious: false,
        matchDiacritics: false,
      },
      state
    );
@ -264,5 +265,293 @@ describe("pdf_find_controller", function () {
      pageMatches: [[19, 46, 62]],
      pageMatchesLength: [[8, 8, 8]],
    });
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "1/2",
      },
      matchesPerPage: [2],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[27, 54]],
      pageMatchesLength: [[1, 1]],
    });
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "½",
      },
      matchesPerPage: [2],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[27, 54]],
      pageMatchesLength: [[1, 1]],
    });
  });
  it("performs a normal search, where the text with diacritics is normalized", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController(
      "french_diacritics.pdf"
    );
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "a",
      },
      matchesPerPage: [6],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[0, 2, 4, 6, 8, 10]],
      pageMatchesLength: [[1, 1, 1, 1, 1, 1]],
    });
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "u",
      },
      matchesPerPage: [6],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[44, 46, 48, 50, 52, 54]],
      pageMatchesLength: [[1, 1, 1, 1, 1, 1]],
    });
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "ë",
        matchDiacritics: true,
      },
      matchesPerPage: [2],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[28, 30]],
      pageMatchesLength: [[1, 1]],
    });
  });
  it("performs a search where one of the results contains an hyphen", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "optimiz",
      },
      matchesPerPage: [1, 4, 2, 3, 3, 0, 2, 9, 1, 0, 0, 6, 3, 4],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
    });
  });
  it("performs a search where the result is on two lines", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "user experience",
      },
      matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[2743]],
      pageMatchesLength: [[14]],
    });
  });
  it("performs a search where the result is on two lines with a punctuation at eol", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "version.the",
      },
      matchesPerPage: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 1,
        matchIndex: 0,
      },
      pageMatches: [[], [1493]],
      pageMatchesLength: [[], [11]],
    });
  });
  it("performs a search with a minus sign in the query", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "trace-based  just-in-time",
      },
      matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [
        [0],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [2087],
      ],
      pageMatchesLength: [
        [24],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [],
        [24],
      ],
    });
  });
  it("performs a search with square brackets in the query", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "[Programming Languages]",
      },
      matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[1501]],
      pageMatchesLength: [[25]],
    });
  });
  it("performs a search with parenthesis in the query", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query: "\t   (checks)",
      },
      matchesPerPage: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 1,
        matchIndex: 0,
      },
      pageMatches: [[], [201]],
      pageMatchesLength: [[], [9]],
    });
  });
  it("performs a search with a final dot in the query", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    // The whitespace after the dot mustn't be matched.
    const query = "complex applications.";
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query,
      },
      matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[1946]],
      pageMatchesLength: [[21]],
    });
  });
  it("performs a search with a dot in the query and a missing whitespace", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    // The whitespace after the dot must be matched.
    const query = "complex applications.J";
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query,
      },
      matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[1946]],
      pageMatchesLength: [[23]],
    });
  });
  it("performs a search with a dot followed by a whitespace in the query", async function () {
    const { eventBus, pdfFindController } = await initPdfFindController();
    const query = "complex applications. j";
    await testSearch({
      eventBus,
      pdfFindController,
      state: {
        query,
      },
      matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      selectedMatch: {
        pageIndex: 0,
        matchIndex: 0,
      },
      pageMatches: [[1946]],
      pageMatchesLength: [[23]],
    });
  });
 });
--- a/test/unit/ui_utils_spec.js
+++ b/test/unit/ui_utils_spec.js
@ -51,6 +51,11 @@ describe("ui_utils", function () {
      expect(binarySearchFirstItem([2, 3, 4], isGreater3)).toEqual(2);
      expect(binarySearchFirstItem([4, 5, 6], isGreater3)).toEqual(0);
    });
    it("three numeric entries and a start index", function () {
      expect(binarySearchFirstItem([0, 1, 2, 3, 4], isGreater3, 2)).toEqual(4);
      expect(binarySearchFirstItem([2, 3, 4], isGreater3, 2)).toEqual(2);
      expect(binarySearchFirstItem([4, 5, 6], isGreater3, 1)).toEqual(1);
    });
  });
  describe("isValidRotation", function () {
--- a/web/app.js
+++ b/web/app.js
@ -2592,6 +2592,7 @@ function webViewerFindFromUrlHash(evt) {
    entireWord: false,
    highlightAll: true,
    findPrevious: false,
    matchDiacritics: true,
  });
 }
--- a/web/firefoxcom.js
+++ b/web/firefoxcom.js
@ -218,6 +218,7 @@ class MozL10n {
    "findcasesensitivitychange",
    "findentirewordchange",
    "findbarclose",
    "finddiacriticmatchingchange",
  ];
  const findLen = "find".length;
@ -238,6 +239,7 @@ class MozL10n {
      entireWord: !!detail.entireWord,
      highlightAll: !!detail.highlightAll,
      findPrevious: !!detail.findPrevious,
      matchDiacritics: !!detail.matchDiacritics,
    });
  };
--- a/web/pdf_find_bar.js
+++ b/web/pdf_find_bar.js
@ -32,6 +32,7 @@ class PDFFindBar {
    this.findField = options.findField;
    this.highlightAll = options.highlightAllCheckbox;
    this.caseSensitive = options.caseSensitiveCheckbox;
    this.matchDiacritics = options.matchDiacriticsCheckbox;
    this.entireWord = options.entireWordCheckbox;
    this.findMsg = options.findMsg;
    this.findResultsCount = options.findResultsCount;
@ -82,6 +83,10 @@ class PDFFindBar {
      this.dispatchEvent("entirewordchange");
    });
    this.matchDiacritics.addEventListener("click", () => {
      this.dispatchEvent("diacriticmatchingchange");
    });
    this.eventBus._on("resize", this._adjustWidth.bind(this));
  }
@ -99,6 +104,7 @@ class PDFFindBar {
      entireWord: this.entireWord.checked,
      highlightAll: this.highlightAll.checked,
      findPrevious: findPrev,
      matchDiacritics: this.matchDiacritics.checked,
    });
  }
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@ -17,9 +17,9 @@
 /** @typedef {import("./event_utils").EventBus} EventBus */
 /** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
 import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
 import { createPromiseCapability } from "pdfjs-lib";
 import { getCharacterType } from "./pdf_find_utils.js";
 import { scrollIntoView } from "./ui_utils.js";
 const FindState = {
  FOUND: 0,
@ -47,47 +47,184 @@ const CHARACTERS_TO_NORMALIZE = {
  "\u00BE": "3/4", // Vulgar fraction three quarters
 };
 // Combining marks listed here are NOT treated as removable diacritics when
 // searching in a document; the list mirrors:
 //   https://searchfox.org/mozilla-central/source/intl/unicharutil/util/is_combining_diacritic.py.
 // The canonical combining class values are defined in:
 //   https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
 // Category 0 corresponds to [^\p{Mn}].
 const DIACRITICS_EXCEPTION = new Set([
  // Combining class 8: UNICODE_COMBINING_CLASS_KANA_VOICING
  // https://www.compart.com/fr/unicode/combining/8
  0x3099, 0x309a,

  // Combining class 9: UNICODE_COMBINING_CLASS_VIRAMA (only code points
  // under 0xFFFF)
  // https://www.compart.com/fr/unicode/combining/9
  0x094d, 0x09cd, 0x0a4d, 0x0acd, 0x0b4d, 0x0bcd, 0x0c4d, 0x0ccd,
  0x0d3b, 0x0d3c, 0x0d4d, 0x0dca, 0x0e3a, 0x0eba, 0x0f84, 0x1039,
  0x103a, 0x1714, 0x1734, 0x17d2, 0x1a60, 0x1b44, 0x1baa, 0x1bab,
  0x1bf2, 0x1bf3, 0x2d7f, 0xa806, 0xa82c, 0xa8c4, 0xa953, 0xa9c0,
  0xaaf6, 0xabed,

  // Combining class 91
  // https://www.compart.com/fr/unicode/combining/91
  0x0c56,

  // Combining class 129
  // https://www.compart.com/fr/unicode/combining/129
  0x0f71,

  // Combining class 130
  // https://www.compart.com/fr/unicode/combining/130
  0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f80,

  // Combining class 132
  // https://www.compart.com/fr/unicode/combining/132
  0x0f74,
 ]);

 // The exceptions above as a single string (same order as the set), so that
 // they can be dropped into a regular expression character class.
 const DIACRITICS_EXCEPTION_STR = String.fromCharCode(...DIACRITICS_EXCEPTION);

 // One or more consecutive combining marks (global, hence stateful when used
 // with `exec`).
 const DIACRITICS_REG_EXP = /\p{M}+/gu;

 // Tokenizes a query; exactly one capture group is set per match:
 //   1: a character with a meaning in regexp syntax, 2: any other punctuation,
 //   3: a run of whitespaces, 4: a combining mark, 5: a letter.
 const SPECIAL_CHARS_REG_EXP =
  /([.*+?^${}()|[\]\\])|(\p{P})|(\s+)|(\p{M})|(\p{L})/gu;

 // Captures the last character of a string which isn't a combining mark,
 // skipping any trailing marks.
 const NOT_DIACRITIC_FROM_END_REG_EXP = /([^\p{M}])\p{M}*$/u;

 // Captures the first character of a string which isn't a combining mark,
 // skipping any leading marks.
 const NOT_DIACRITIC_FROM_START_REG_EXP = /^\p{M}*([^\p{M}])/u;
 let normalizationRegex = null;
 function normalize(text) {
  // The diacritics in the text or in the query can be composed or not.
  // So we use a decomposed text using NFD (and the same for the query)
  // in order to be sure that diacritics are in the same order.
  if (!normalizationRegex) {
    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
-    normalizationRegex = new RegExp(`[${replace}]`, "g");
+    normalizationRegex = new RegExp(
      `([${replace}])|(\\S-\\n)|(\\n)|(\\p{M}+)`,
      "gum"
    );
  }
  let diffs = null;
  const normalizedText = text.replace(normalizationRegex, function (ch, index) {
    const normalizedCh = CHARACTERS_TO_NORMALIZE[ch],
      diff = normalizedCh.length - ch.length;
    if (diff !== 0) {
      (diffs ||= []).push([index, diff]);
    }
    return normalizedCh;
  });
-  return [normalizedText, diffs];
+  // The goal of this function is to normalize the string and
  // be able to get from an index in the new string the
  // corresponding index in the old string.
  // For example if we have: abCd12ef456gh where C is replaced by ccc
  // and numbers replaced by nothing (it's the case for diacritics), then
  // we'll obtain the normalized string: abcccdefgh.
  // So here the reverse map is: [0,1,2,2,2,3,6,7,11,12].
  // The goal is to obtain the array: [[0, 0], [3, -1], [4, -2],
  // [6, 0], [8, 3]].
  // which can be used like this:
  //  - let's say that i is the index in the new string and j the index
  //    in the old string.
  //  - if i is in [0; 3[ then j = i + 0
  //  - if i is in [3; 4[ then j = i - 1
  //  - if i is in [4; 6[ then j = i - 2
  //  ...
  // Thanks to a binary search it's easy to find the entry which contains i
  // and hence the shift to apply.
  // Let's say that the last entry in the array is [x, s] and we have a
  // substitution at index y (old string) which will replace o chars by n chars.
  // Firstly, if o === n, then no need to add a new entry: the shift is
  // the same.
  // Secondly, if o < n, then we push the n - o elements:
  // [y - (s - 1), s - 1], [y - (s - 2), s - 2], ...
  // Thirdly, if o > n, then we push the element: [y - (s - n), o + s - n]
  // Collect diacritics length and positions.
  const rawDiacriticsPositions = [];
  let m;
  while ((m = DIACRITICS_REG_EXP.exec(text)) !== null) {
    rawDiacriticsPositions.push([m[0].length, m.index]);
  }
  let normalized = text.normalize("NFD");
  const positions = [[0, 0]];
  let k = 0;
  let shift = 0;
  let shiftOrigin = 0;
  let eol = 0;
  let hasDiacritics = false;
  normalized = normalized.replace(
    normalizationRegex,
    (match, p1, p2, p3, p4, i) => {
      i -= shiftOrigin;
      if (p1) {
        // Maybe fractions or quotations mark...
        const replacement = CHARACTERS_TO_NORMALIZE[match];
        const jj = replacement.length;
        for (let j = 1; j < jj; j++) {
          positions.push([i - shift + j, shift - j]);
        }
        shift -= jj - 1;
        return replacement;
      }
      if (p2) {
        // "X-\n" is replaced by "X": a hyphen at the end of a line which
        // isn't preceded by a whitespace most likely marks a break inside
        // a word.
        positions.push([i - shift, 1 + shift]);
        shift += 1;
        shiftOrigin += 1;
        eol += 1;
        return p2.charAt(0);
      }
      if (p3) {
        // eol is replaced by space: "foo\nbar" is likely equivalent to
        // "foo bar".
        positions.push([i - shift + 1, shift - 1]);
        shift -= 1;
        shiftOrigin += 1;
        eol += 1;
        return " ";
      }
      // Diacritics.
      hasDiacritics = true;
      let jj = p4.length;
      if (i + eol === rawDiacriticsPositions[k]?.[1]) {
        jj -= rawDiacriticsPositions[k][0];
        ++k;
      }
      for (let j = 1; j < jj + 1; j++) {
        // i is the position of the first diacritic
        // so (i - 1) is the position for the letter before.
        positions.push([i - 1 - shift + j, shift - j]);
      }
      shift -= jj;
      shiftOrigin += jj;
      return p4;
    }
  );
  positions.push([normalized.length, shift]);
  return [normalized, positions, hasDiacritics];
 }
 // Determine the original, non-normalized, match index such that highlighting of
 // search results is correct in the `textLayer` for strings containing e.g. "½"
 // characters; essentially "inverting" the result of the `normalize` function.
-function getOriginalIndex(matchIndex, diffs = null) {
+function getOriginalIndex(diffs, pos, len) {
  if (!diffs) {
-    return matchIndex;
+    return [pos, len];
  }
  let totalDiff = 0;
  for (const [index, diff] of diffs) {
    const currentIndex = index + totalDiff;
-    if (currentIndex >= matchIndex) {
+  const start = pos;
-      break;
+  const end = pos + len;
-    }
+  let i = binarySearchFirstItem(diffs, x => x[0] >= start);
-    if (currentIndex + diff > matchIndex) {
+  if (diffs[i][0] > start) {
-      totalDiff += matchIndex - currentIndex;
+    --i;
      break;
    }
    totalDiff += diff;
  }
-  return matchIndex - totalDiff;
+
  let j = binarySearchFirstItem(diffs, x => x[0] >= end, i);
  if (diffs[j][0] > end) {
    --j;
  }
  return [start + diffs[i][1], len + diffs[j][1] - diffs[i][1]];
 }
 /**
@ -276,6 +413,7 @@ class PDFFindController {
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the normalized text for each page.
    this._pageDiffs = [];
    this._hasDiacritics = [];
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = new Set();
@ -331,192 +469,169 @@ class PDFFindController {
    return true;
  }
  /**
   * Helper for multi-term search that fills the `matchesWithLength` array
   * and handles cases where one search term includes another search term (for
   * example, "tamed tame" or "this is"). It looks for intersecting terms in
   * the `matches` and keeps elements with a longer match length.
   */
  _prepareMatches(matchesWithLength, matches, matchesLength) {
    function isSubTerm(currentIndex) {
      const currentElem = matchesWithLength[currentIndex];
      const nextElem = matchesWithLength[currentIndex + 1];
      // Check for cases like "TAMEd TAME".
      if (
        currentIndex < matchesWithLength.length - 1 &&
        currentElem.match === nextElem.match
      ) {
        currentElem.skipped = true;
        return true;
      }
      // Check for cases like "thIS IS".
      for (let i = currentIndex - 1; i >= 0; i--) {
        const prevElem = matchesWithLength[i];
        if (prevElem.skipped) {
          continue;
        }
        if (prevElem.match + prevElem.matchLength < currentElem.match) {
          break;
        }
        if (
          prevElem.match + prevElem.matchLength >=
          currentElem.match + currentElem.matchLength
        ) {
          currentElem.skipped = true;
          return true;
        }
      }
      return false;
    }
    // Sort the array of `{ match: <match>, matchLength: <matchLength> }`
    // objects on increasing index first and on the length otherwise.
    matchesWithLength.sort(function (a, b) {
      return a.match === b.match
        ? a.matchLength - b.matchLength
        : a.match - b.match;
    });
    for (let i = 0, len = matchesWithLength.length; i < len; i++) {
      if (isSubTerm(i)) {
        continue;
      }
      matches.push(matchesWithLength[i].match);
      matchesLength.push(matchesWithLength[i].matchLength);
    }
  }
  /**
   * Determine if the search query constitutes a "whole word", by comparing the
   * first/last character type with the preceding/following character type.
   */
  _isEntireWord(content, startIdx, length) {
-    if (startIdx > 0) {
+    let match = content
      .slice(0, startIdx)
      .match(NOT_DIACRITIC_FROM_END_REG_EXP);
    if (match) {
      const first = content.charCodeAt(startIdx);
-      const limit = content.charCodeAt(startIdx - 1);
+      const limit = match[1].charCodeAt(0);
      if (getCharacterType(first) === getCharacterType(limit)) {
        return false;
      }
    }
-    const endIdx = startIdx + length - 1;
+
-    if (endIdx < content.length - 1) {
+    match = content
-      const last = content.charCodeAt(endIdx);
+      .slice(startIdx + length)
-      const limit = content.charCodeAt(endIdx + 1);
+      .match(NOT_DIACRITIC_FROM_START_REG_EXP);
    if (match) {
      const last = content.charCodeAt(startIdx + length - 1);
      const limit = match[1].charCodeAt(0);
      if (getCharacterType(last) === getCharacterType(limit)) {
        return false;
      }
    }
    return true;
  }
-  _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
+  _calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
    const matches = [],
      matchesLength = [];
    const queryLen = query.length;
-    let matchIdx = -queryLen;
+    const diffs = this._pageDiffs[pageIndex];
-    while (true) {
+    let match;
-      matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
+    while ((match = query.exec(pageContent)) !== null) {
-      if (matchIdx === -1) {
+      if (
-        break;
+        entireWord &&
-      }
+        !this._isEntireWord(pageContent, match.index, match[0].length)
-      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
+      ) {
        continue;
      }
      const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
        matchEnd = matchIdx + queryLen - 1,
        originalQueryLen =
          getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
-      matches.push(originalMatchIdx);
+      const [matchPos, matchLen] = getOriginalIndex(
-      matchesLength.push(originalQueryLen);
+        diffs,
        match.index,
        match[0].length
      );
      if (matchLen) {
        matches.push(matchPos);
        matchesLength.push(matchLen);
      }
    }
    this._pageMatches[pageIndex] = matches;
    this._pageMatchesLength[pageIndex] = matchesLength;
  }
-  _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
+  _convertToRegExpString(query, hasDiacritics) {
-    const matchesWithLength = [];
+    const { matchDiacritics } = this._state;
    let isUnicode = false;
    query = query.replace(
      SPECIAL_CHARS_REG_EXP,
      (
        match,
        p1 /* to escape */,
        p2 /* punctuation */,
        p3 /* whitespaces */,
        p4 /* diacritics */,
        p5 /* letters */
      ) => {
        // We don't need to use a \s for whitespaces since all the different
        // kind of whitespaces are replaced by a single " ".
-    // Divide the query into pieces and search for text in each piece.
+        if (p1) {
-    const queryArray = query.match(/\S+/g);
+          // Escape characters like *+?... so they don't interfere with regexp syntax.
-    for (let i = 0, len = queryArray.length; i < len; i++) {
+          return `[ ]*\\${p1}[ ]*`;
      const subquery = queryArray[i];
      const subqueryLen = subquery.length;
      let matchIdx = -subqueryLen;
      while (true) {
        matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
        if (matchIdx === -1) {
          break;
        }
-        if (
+        if (p2) {
-          entireWord &&
+          // Allow whitespaces around punctuation signs.
-          !this._isEntireWord(pageContent, matchIdx, subqueryLen)
+          return `[ ]*${p2}[ ]*`;
-        ) {
+        }
-          continue;
+        if (p3) {
          // A run of whitespaces in the query must match one or more spaces.
          return "[ ]+";
        }
        if (matchDiacritics) {
          return p4 || p5;
        }
        const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
          matchEnd = matchIdx + subqueryLen - 1,
          originalQueryLen =
            getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
-        // Other searches do not, so we store the length.
+        if (p4) {
-        matchesWithLength.push({
+          // Diacritics are removed with few exceptions.
-          match: originalMatchIdx,
+          return DIACRITICS_EXCEPTION.has(p4.charCodeAt(0)) ? p4 : "";
-          matchLength: originalQueryLen,
+        }
-          skipped: false,
+
-        });
+        // A letter has been matched and it can be followed by any diacritics
        // in normalized text.
        if (hasDiacritics) {
          isUnicode = true;
          return `${p5}\\p{M}*`;
        }
        return p5;
      }
    );
    const trailingSpaces = "[ ]*";
    if (query.endsWith(trailingSpaces)) {
      // The [ ]* has been added in order to help to match "foo . bar" but
      // it doesn't make sense to match some whitespaces after the dot
      // when it's the last character.
      query = query.slice(0, query.length - trailingSpaces.length);
    }
    if (matchDiacritics) {
      // aX must not match aXY.
      if (hasDiacritics) {
        isUnicode = true;
        query = `${query}(?=[${DIACRITICS_EXCEPTION_STR}]|[^\\p{M}]|$)`;
      }
    }
-    // Prepare arrays for storing the matches.
+    return [isUnicode, query];
    this._pageMatchesLength[pageIndex] = [];
    this._pageMatches[pageIndex] = [];
    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    this._prepareMatches(
      matchesWithLength,
      this._pageMatches[pageIndex],
      this._pageMatchesLength[pageIndex]
    );
  }
  _calculateMatch(pageIndex) {
    let pageContent = this._pageContents[pageIndex];
    const pageDiffs = this._pageDiffs[pageIndex];
    let query = this._query;
    const { caseSensitive, entireWord, phraseSearch } = this._state;
    if (query.length === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }
-    if (!caseSensitive) {
+    const { caseSensitive, entireWord, phraseSearch } = this._state;
-      pageContent = pageContent.toLowerCase();
+    const pageContent = this._pageContents[pageIndex];
-      query = query.toLowerCase();
+    const hasDiacritics = this._hasDiacritics[pageIndex];
    let isUnicode = false;
    if (phraseSearch) {
      [isUnicode, query] = this._convertToRegExpString(query, hasDiacritics);
    } else {
      // Words are sorted in reverse order to be sure that "foobar" is matched
      // before "foo" in case the query is "foobar foo".
      const match = query.match(/\S+/g);
      if (match) {
        query = match
          .sort()
          .reverse()
          .map(q => {
            const [isUnicodePart, queryPart] = this._convertToRegExpString(
              q,
              hasDiacritics
            );
            isUnicode ||= isUnicodePart;
            return `(${queryPart})`;
          })
          .join("|");
      }
    }
-    if (phraseSearch) {
+    const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
-      this._calculatePhraseMatch(
+    query = new RegExp(query, flags);
-        query,
+
-        pageIndex,
+    this._calculateRegExpMatch(query, entireWord, pageIndex, pageContent);
        pageContent,
        pageDiffs,
        entireWord
      );
    } else {
      this._calculateWordMatch(
        query,
        pageIndex,
        pageContent,
        pageDiffs,
        entireWord
      );
    }
    // When `highlightAll` is set, ensure that the matches on previously
    // rendered (and still active) pages are correctly highlighted.
@ -560,12 +675,17 @@ class PDFFindController {
              for (let j = 0, jj = textItems.length; j < jj; j++) {
                strBuf.push(textItems[j].str);
                if (textItems[j].hasEOL) {
                  strBuf.push("\n");
                }
              }
              // Store the normalized page content (text items) as one string.
-              [this._pageContents[i], this._pageDiffs[i]] = normalize(
+              [
-                strBuf.join("")
+                this._pageContents[i],
-              );
+                this._pageDiffs[i],
                this._hasDiacritics[i],
              ] = normalize(strBuf.join(""));
              extractTextCapability.resolve(i);
            },
            reason => {
@ -576,6 +696,7 @@ class PDFFindController {
              // Page error -- assuming no text content.
              this._pageContents[i] = "";
              this._pageDiffs[i] = null;
              this._hasDiacritics[i] = false;
              extractTextCapability.resolve(i);
            }
          );
--- a/web/ui_utils.js
+++ b/web/ui_utils.js
@ -227,8 +227,8 @@ function removeNullCharacters(str, replaceInvisible = false) {
 * @returns {number} Index of the first array element to pass the test,
 *                   or |items.length| if no such element exists.
 */
-function binarySearchFirstItem(items, condition) {
+function binarySearchFirstItem(items, condition, start = 0) {
-  let minIndex = 0;
+  let minIndex = start;
  let maxIndex = items.length - 1;
  if (maxIndex < 0 || !condition(items[maxIndex])) {
--- a/web/viewer.html
+++ b/web/viewer.html
@ -135,8 +135,13 @@ See https://github.com/adobe-type-tools/cmap-resources
            <label for="findMatchCase" class="toolbarLabel" data-l10n-id="find_match_case_label">Match case</label>
          </div>
          <div id="findbarOptionsTwoContainer">
-            <input type="checkbox" id="findEntireWord" class="toolbarField" tabindex="96">
+            <input type="checkbox" id="findMatchDiacritics" class="toolbarField" tabindex="96">
            <label for="findMatchDiacritics" class="toolbarLabel" data-l10n-id="find_match_diacritics_label">Match Diacritics</label>
            <input type="checkbox" id="findEntireWord" class="toolbarField" tabindex="97">
            <label for="findEntireWord" class="toolbarLabel" data-l10n-id="find_entire_word_label">Whole words</label>
          </div>
          <div id="findbarOptionsThreeContainer">
            <span id="findResultsCount" class="toolbarLabel hidden"></span>
          </div>
--- a/web/viewer.js
+++ b/web/viewer.js
@ -153,6 +153,7 @@ function getViewerConfiguration() {
      findField: document.getElementById("findInput"),
      highlightAllCheckbox: document.getElementById("findHighlightAll"),
      caseSensitiveCheckbox: document.getElementById("findMatchCase"),
      matchDiacriticsCheckbox: document.getElementById("findMatchDiacritics"),
      entireWordCheckbox: document.getElementById("findEntireWord"),
      findMsg: document.getElementById("findMsg"),
      findResultsCount: document.getElementById("findResultsCount"),