0e19c3a120
*Please note:* This patch only extends the `PDFFindController` implementation itself to support this functionality, however it's *purposely* not exposed in the default viewer. This replaces the previous `phraseSearch`-parameter, and a `query`-string will now always be interpreted as a phrase-search. To enable searching for individual words, the `query`-parameter must instead consist of an Array of strings. This way it's now also possible to combine phrase/word searches, with a `query`-parameter looking something like `["Lorem ipsum", "foo", "bar"]` which will search for the phrase "Lorem ipsum" *and* the words "foo" respectively "bar".
1147 lines
36 KiB
JavaScript
1147 lines
36 KiB
JavaScript
/* Copyright 2012 Mozilla Foundation
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
|
|
/** @typedef {import("../src/display/api").PDFDocumentProxy} PDFDocumentProxy */
|
|
/** @typedef {import("./event_utils").EventBus} EventBus */
|
|
/** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */
|
|
|
|
import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js";
|
|
import { createPromiseCapability } from "pdfjs-lib";
|
|
import { getCharacterType } from "./pdf_find_utils.js";
|
|
|
|
const FindState = {
|
|
FOUND: 0,
|
|
NOT_FOUND: 1,
|
|
WRAPPED: 2,
|
|
PENDING: 3,
|
|
};
|
|
|
|
const FIND_TIMEOUT = 250; // ms
|
|
const MATCH_SCROLL_OFFSET_TOP = -50; // px
|
|
const MATCH_SCROLL_OFFSET_LEFT = -400; // px
|
|
|
|
const CHARACTERS_TO_NORMALIZE = {
|
|
"\u2010": "-", // Hyphen
|
|
"\u2018": "'", // Left single quotation mark
|
|
"\u2019": "'", // Right single quotation mark
|
|
"\u201A": "'", // Single low-9 quotation mark
|
|
"\u201B": "'", // Single high-reversed-9 quotation mark
|
|
"\u201C": '"', // Left double quotation mark
|
|
"\u201D": '"', // Right double quotation mark
|
|
"\u201E": '"', // Double low-9 quotation mark
|
|
"\u201F": '"', // Double high-reversed-9 quotation mark
|
|
"\u00BC": "1/4", // Vulgar fraction one quarter
|
|
"\u00BD": "1/2", // Vulgar fraction one half
|
|
"\u00BE": "3/4", // Vulgar fraction three quarters
|
|
};
|
|
|
|
// These diacritics aren't considered as combining diacritics
|
|
// when searching in a document:
|
|
// https://searchfox.org/mozilla-central/source/intl/unicharutil/util/is_combining_diacritic.py.
|
|
// The combining class definitions can be found:
|
|
// https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
|
|
// Category 0 corresponds to [^\p{Mn}].
|
|
const DIACRITICS_EXCEPTION = new Set([
|
|
// UNICODE_COMBINING_CLASS_KANA_VOICING
|
|
// https://www.compart.com/fr/unicode/combining/8
|
|
0x3099, 0x309a,
|
|
// UNICODE_COMBINING_CLASS_VIRAMA (under 0xFFFF)
|
|
// https://www.compart.com/fr/unicode/combining/9
|
|
0x094d, 0x09cd, 0x0a4d, 0x0acd, 0x0b4d, 0x0bcd, 0x0c4d, 0x0ccd, 0x0d3b,
|
|
0x0d3c, 0x0d4d, 0x0dca, 0x0e3a, 0x0eba, 0x0f84, 0x1039, 0x103a, 0x1714,
|
|
0x1734, 0x17d2, 0x1a60, 0x1b44, 0x1baa, 0x1bab, 0x1bf2, 0x1bf3, 0x2d7f,
|
|
0xa806, 0xa82c, 0xa8c4, 0xa953, 0xa9c0, 0xaaf6, 0xabed,
|
|
// 91
|
|
// https://www.compart.com/fr/unicode/combining/91
|
|
0x0c56,
|
|
// 129
|
|
// https://www.compart.com/fr/unicode/combining/129
|
|
0x0f71,
|
|
// 130
|
|
// https://www.compart.com/fr/unicode/combining/130
|
|
0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f80,
|
|
// 132
|
|
// https://www.compart.com/fr/unicode/combining/132
|
|
0x0f74,
|
|
]);
|
|
let DIACRITICS_EXCEPTION_STR; // Lazily initialized, see below.
|
|
|
|
const DIACRITICS_REG_EXP = /\p{M}+/gu;
|
|
const SPECIAL_CHARS_REG_EXP =
|
|
/([.*+?^${}()|[\]\\])|(\p{P})|(\s+)|(\p{M})|(\p{L})/gu;
|
|
const NOT_DIACRITIC_FROM_END_REG_EXP = /([^\p{M}])\p{M}*$/u;
|
|
const NOT_DIACRITIC_FROM_START_REG_EXP = /^\p{M}*([^\p{M}])/u;
|
|
|
|
// The range [AC00-D7AF] corresponds to the Hangul syllables.
|
|
// The few other chars are some CJK Compatibility Ideographs.
|
|
const SYLLABLES_REG_EXP = /[\uAC00-\uD7AF\uFA6C\uFACF-\uFAD1\uFAD5-\uFAD7]+/g;
|
|
const SYLLABLES_LENGTHS = new Map();
|
|
// When decomposed (in using NFD) the above syllables will start
|
|
// with one of the chars in this regexp.
|
|
const FIRST_CHAR_SYLLABLES_REG_EXP =
|
|
"[\\u1100-\\u1112\\ud7a4-\\ud7af\\ud84a\\ud84c\\ud850\\ud854\\ud857\\ud85f]";
|
|
|
|
const NFKC_CHARS_TO_NORMALIZE = new Map();
|
|
|
|
let noSyllablesRegExp = null;
|
|
let withSyllablesRegExp = null;
|
|
|
|
function normalize(text) {
|
|
// The diacritics in the text or in the query can be composed or not.
|
|
// So we use a decomposed text using NFD (and the same for the query)
|
|
// in order to be sure that diacritics are in the same order.
|
|
|
|
// Collect syllables length and positions.
|
|
const syllablePositions = [];
|
|
let m;
|
|
while ((m = SYLLABLES_REG_EXP.exec(text)) !== null) {
|
|
let { index } = m;
|
|
for (const char of m[0]) {
|
|
let len = SYLLABLES_LENGTHS.get(char);
|
|
if (!len) {
|
|
len = char.normalize("NFD").length;
|
|
SYLLABLES_LENGTHS.set(char, len);
|
|
}
|
|
syllablePositions.push([len, index++]);
|
|
}
|
|
}
|
|
|
|
let normalizationRegex;
|
|
if (syllablePositions.length === 0 && noSyllablesRegExp) {
|
|
normalizationRegex = noSyllablesRegExp;
|
|
} else if (syllablePositions.length > 0 && withSyllablesRegExp) {
|
|
normalizationRegex = withSyllablesRegExp;
|
|
} else {
|
|
// Compile the regular expression for text normalization once.
|
|
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
|
|
const toNormalizeWithNFKC =
|
|
"\u2460-\u2473" + // Circled numbers.
|
|
"\u24b6-\u24ff" + // Circled letters/numbers.
|
|
"\u3244-\u32bf" + // Circled ideograms/numbers.
|
|
"\u32d0-\u32fe" + // Circled ideograms.
|
|
"\uff00-\uffef"; // Halfwidth, fullwidth forms.
|
|
|
|
// 3040-309F: Hiragana
|
|
// 30A0-30FF: Katakana
|
|
const CJK = "(?:\\p{Ideographic}|[\u3040-\u30FF])";
|
|
const HKDiacritics = "(?:\u3099|\u309A)";
|
|
const regexp = `([${replace}])|([${toNormalizeWithNFKC}])|(${HKDiacritics}\\n)|(\\p{M}+(?:-\\n)?)|(\\S-\\n)|(${CJK}\\n)|(\\n)`;
|
|
|
|
if (syllablePositions.length === 0) {
|
|
// Most of the syllables belong to Hangul so there are no need
|
|
// to search for them in a non-Hangul document.
|
|
// We use the \0 in order to have the same number of groups.
|
|
normalizationRegex = noSyllablesRegExp = new RegExp(
|
|
regexp + "|(\\u0000)",
|
|
"gum"
|
|
);
|
|
} else {
|
|
normalizationRegex = withSyllablesRegExp = new RegExp(
|
|
regexp + `|(${FIRST_CHAR_SYLLABLES_REG_EXP})`,
|
|
"gum"
|
|
);
|
|
}
|
|
}
|
|
|
|
// The goal of this function is to normalize the string and
|
|
// be able to get from an index in the new string the
|
|
// corresponding index in the old string.
|
|
// For example if we have: abCd12ef456gh where C is replaced by ccc
|
|
// and numbers replaced by nothing (it's the case for diacritics), then
|
|
// we'll obtain the normalized string: abcccdefgh.
|
|
// So here the reverse map is: [0,1,2,2,2,3,6,7,11,12].
|
|
|
|
// The goal is to obtain the array: [[0, 0], [3, -1], [4, -2],
|
|
// [6, 0], [8, 3]].
|
|
// which can be used like this:
|
|
// - let say that i is the index in new string and j the index
|
|
// the old string.
|
|
// - if i is in [0; 3[ then j = i + 0
|
|
// - if i is in [3; 4[ then j = i - 1
|
|
// - if i is in [4; 6[ then j = i - 2
|
|
// ...
|
|
// Thanks to a binary search it's easy to know where is i and what's the
|
|
// shift.
|
|
// Let say that the last entry in the array is [x, s] and we have a
|
|
// substitution at index y (old string) which will replace o chars by n chars.
|
|
// Firstly, if o === n, then no need to add a new entry: the shift is
|
|
// the same.
|
|
// Secondly, if o < n, then we push the n - o elements:
|
|
// [y - (s - 1), s - 1], [y - (s - 2), s - 2], ...
|
|
// Thirdly, if o > n, then we push the element: [y - (s - n), o + s - n]
|
|
|
|
// Collect diacritics length and positions.
|
|
const rawDiacriticsPositions = [];
|
|
while ((m = DIACRITICS_REG_EXP.exec(text)) !== null) {
|
|
rawDiacriticsPositions.push([m[0].length, m.index]);
|
|
}
|
|
|
|
let normalized = text.normalize("NFD");
|
|
const positions = [[0, 0]];
|
|
let rawDiacriticsIndex = 0;
|
|
let syllableIndex = 0;
|
|
let shift = 0;
|
|
let shiftOrigin = 0;
|
|
let eol = 0;
|
|
let hasDiacritics = false;
|
|
|
|
normalized = normalized.replace(
|
|
normalizationRegex,
|
|
(match, p1, p2, p3, p4, p5, p6, p7, p8, i) => {
|
|
i -= shiftOrigin;
|
|
if (p1) {
|
|
// Maybe fractions or quotations mark...
|
|
const replacement = CHARACTERS_TO_NORMALIZE[p1];
|
|
const jj = replacement.length;
|
|
for (let j = 1; j < jj; j++) {
|
|
positions.push([i - shift + j, shift - j]);
|
|
}
|
|
shift -= jj - 1;
|
|
return replacement;
|
|
}
|
|
|
|
if (p2) {
|
|
// Use the NFKC representation to normalize the char.
|
|
let replacement = NFKC_CHARS_TO_NORMALIZE.get(p2);
|
|
if (!replacement) {
|
|
replacement = p2.normalize("NFKC");
|
|
NFKC_CHARS_TO_NORMALIZE.set(p2, replacement);
|
|
}
|
|
const jj = replacement.length;
|
|
for (let j = 1; j < jj; j++) {
|
|
positions.push([i - shift + j, shift - j]);
|
|
}
|
|
shift -= jj - 1;
|
|
return replacement;
|
|
}
|
|
|
|
if (p3) {
|
|
// We've a Katakana-Hiragana diacritic followed by a \n so don't replace
|
|
// the \n by a whitespace.
|
|
hasDiacritics = true;
|
|
|
|
// Diacritic.
|
|
if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) {
|
|
++rawDiacriticsIndex;
|
|
} else {
|
|
// i is the position of the first diacritic
|
|
// so (i - 1) is the position for the letter before.
|
|
positions.push([i - 1 - shift + 1, shift - 1]);
|
|
shift -= 1;
|
|
shiftOrigin += 1;
|
|
}
|
|
|
|
// End-of-line.
|
|
positions.push([i - shift + 1, shift]);
|
|
shiftOrigin += 1;
|
|
eol += 1;
|
|
|
|
return p3.charAt(0);
|
|
}
|
|
|
|
if (p4) {
|
|
const hasTrailingDashEOL = p4.endsWith("\n");
|
|
const len = hasTrailingDashEOL ? p4.length - 2 : p4.length;
|
|
|
|
// Diacritics.
|
|
hasDiacritics = true;
|
|
let jj = len;
|
|
if (i + eol === rawDiacriticsPositions[rawDiacriticsIndex]?.[1]) {
|
|
jj -= rawDiacriticsPositions[rawDiacriticsIndex][0];
|
|
++rawDiacriticsIndex;
|
|
}
|
|
|
|
for (let j = 1; j <= jj; j++) {
|
|
// i is the position of the first diacritic
|
|
// so (i - 1) is the position for the letter before.
|
|
positions.push([i - 1 - shift + j, shift - j]);
|
|
}
|
|
shift -= jj;
|
|
shiftOrigin += jj;
|
|
|
|
if (hasTrailingDashEOL) {
|
|
// Diacritics are followed by a -\n.
|
|
// See comments in `if (p5)` block.
|
|
i += len - 1;
|
|
positions.push([i - shift + 1, 1 + shift]);
|
|
shift += 1;
|
|
shiftOrigin += 1;
|
|
eol += 1;
|
|
return p4.slice(0, len);
|
|
}
|
|
|
|
return p4;
|
|
}
|
|
|
|
if (p5) {
|
|
// "X-\n" is removed because an hyphen at the end of a line
|
|
// with not a space before is likely here to mark a break
|
|
// in a word.
|
|
// If X is encoded with UTF-32 then it can have a length greater than 1.
|
|
// The \n isn't in the original text so here y = i, n = X.len - 2 and
|
|
// o = X.len - 1.
|
|
const len = p5.length - 2;
|
|
positions.push([i - shift + len, 1 + shift]);
|
|
shift += 1;
|
|
shiftOrigin += 1;
|
|
eol += 1;
|
|
return p5.slice(0, -2);
|
|
}
|
|
|
|
if (p6) {
|
|
// An ideographic at the end of a line doesn't imply adding an extra
|
|
// white space.
|
|
// A CJK can be encoded in UTF-32, hence their length isn't always 1.
|
|
const len = p6.length - 1;
|
|
positions.push([i - shift + len, shift]);
|
|
shiftOrigin += 1;
|
|
eol += 1;
|
|
return p6.slice(0, -1);
|
|
}
|
|
|
|
if (p7) {
|
|
// eol is replaced by space: "foo\nbar" is likely equivalent to
|
|
// "foo bar".
|
|
positions.push([i - shift + 1, shift - 1]);
|
|
shift -= 1;
|
|
shiftOrigin += 1;
|
|
eol += 1;
|
|
return " ";
|
|
}
|
|
|
|
// p8
|
|
if (i + eol === syllablePositions[syllableIndex]?.[1]) {
|
|
// A syllable (1 char) is replaced with several chars (n) so
|
|
// newCharsLen = n - 1.
|
|
const newCharLen = syllablePositions[syllableIndex][0] - 1;
|
|
++syllableIndex;
|
|
for (let j = 1; j <= newCharLen; j++) {
|
|
positions.push([i - (shift - j), shift - j]);
|
|
}
|
|
shift -= newCharLen;
|
|
shiftOrigin += newCharLen;
|
|
}
|
|
return p8;
|
|
}
|
|
);
|
|
|
|
positions.push([normalized.length, shift]);
|
|
|
|
return [normalized, positions, hasDiacritics];
|
|
}
|
|
|
|
// Determine the original, non-normalized, match index such that highlighting of
|
|
// search results is correct in the `textLayer` for strings containing e.g. "½"
|
|
// characters; essentially "inverting" the result of the `normalize` function.
|
|
function getOriginalIndex(diffs, pos, len) {
|
|
if (!diffs) {
|
|
return [pos, len];
|
|
}
|
|
|
|
// First char in the new string.
|
|
const start = pos;
|
|
// Last char in the new string.
|
|
const end = pos + len - 1;
|
|
let i = binarySearchFirstItem(diffs, x => x[0] >= start);
|
|
if (diffs[i][0] > start) {
|
|
--i;
|
|
}
|
|
|
|
let j = binarySearchFirstItem(diffs, x => x[0] >= end, i);
|
|
if (diffs[j][0] > end) {
|
|
--j;
|
|
}
|
|
|
|
// First char in the old string.
|
|
const oldStart = start + diffs[i][1];
|
|
|
|
// Last char in the old string.
|
|
const oldEnd = end + diffs[j][1];
|
|
const oldLen = oldEnd + 1 - oldStart;
|
|
|
|
return [oldStart, oldLen];
|
|
}
|
|
|
|
/**
|
|
* @typedef {Object} PDFFindControllerOptions
|
|
* @property {IPDFLinkService} linkService - The navigation/linking service.
|
|
* @property {EventBus} eventBus - The application event bus.
|
|
* @property {boolean} [updateMatchesCountOnProgress] - True if the matches
|
|
* count must be updated on progress or only when the last page is reached.
|
|
* The default value is `true`.
|
|
*/
|
|
|
|
/**
|
|
* Provides search functionality to find a given string in a PDF document.
|
|
*/
|
|
class PDFFindController {
|
|
#state = null;
|
|
|
|
#updateMatchesCountOnProgress = true;
|
|
|
|
#visitedPagesCount = 0;
|
|
|
|
/**
|
|
* @param {PDFFindControllerOptions} options
|
|
*/
|
|
constructor({ linkService, eventBus, updateMatchesCountOnProgress = true }) {
|
|
this._linkService = linkService;
|
|
this._eventBus = eventBus;
|
|
this.#updateMatchesCountOnProgress = updateMatchesCountOnProgress;
|
|
|
|
this.#reset();
|
|
eventBus._on("find", this.#onFind.bind(this));
|
|
eventBus._on("findbarclose", this.#onFindBarClose.bind(this));
|
|
}
|
|
|
|
get highlightMatches() {
|
|
return this._highlightMatches;
|
|
}
|
|
|
|
get pageMatches() {
|
|
return this._pageMatches;
|
|
}
|
|
|
|
get pageMatchesLength() {
|
|
return this._pageMatchesLength;
|
|
}
|
|
|
|
get selected() {
|
|
return this._selected;
|
|
}
|
|
|
|
get state() {
|
|
return this.#state;
|
|
}
|
|
|
|
/**
|
|
* Set a reference to the PDF document in order to search it.
|
|
* Note that searching is not possible if this method is not called.
|
|
*
|
|
* @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
|
|
*/
|
|
setDocument(pdfDocument) {
|
|
if (this._pdfDocument) {
|
|
this.#reset();
|
|
}
|
|
if (!pdfDocument) {
|
|
return;
|
|
}
|
|
this._pdfDocument = pdfDocument;
|
|
this._firstPageCapability.resolve();
|
|
}
|
|
|
|
#onFind(state) {
|
|
if (!state) {
|
|
return;
|
|
}
|
|
if (
|
|
(typeof PDFJSDev === "undefined" || PDFJSDev.test("GENERIC")) &&
|
|
state.phraseSearch === false
|
|
) {
|
|
console.error(
|
|
"The `phraseSearch`-parameter was removed, please provide " +
|
|
"an Array of strings in the `query`-parameter instead."
|
|
);
|
|
if (typeof state.query === "string") {
|
|
state.query = state.query.match(/\S+/g);
|
|
}
|
|
}
|
|
const pdfDocument = this._pdfDocument;
|
|
const { type } = state;
|
|
|
|
if (this.#state === null || this.#shouldDirtyMatch(state)) {
|
|
this._dirtyMatch = true;
|
|
}
|
|
this.#state = state;
|
|
if (type !== "highlightallchange") {
|
|
this.#updateUIState(FindState.PENDING);
|
|
}
|
|
|
|
this._firstPageCapability.promise.then(() => {
|
|
// If the document was closed before searching began, or if the search
|
|
// operation was relevant for a previously opened document, do nothing.
|
|
if (
|
|
!this._pdfDocument ||
|
|
(pdfDocument && this._pdfDocument !== pdfDocument)
|
|
) {
|
|
return;
|
|
}
|
|
this.#extractText();
|
|
|
|
const findbarClosed = !this._highlightMatches;
|
|
const pendingTimeout = !!this._findTimeout;
|
|
|
|
if (this._findTimeout) {
|
|
clearTimeout(this._findTimeout);
|
|
this._findTimeout = null;
|
|
}
|
|
if (!type) {
|
|
// Trigger the find action with a small delay to avoid starting the
|
|
// search when the user is still typing (saving resources).
|
|
this._findTimeout = setTimeout(() => {
|
|
this.#nextMatch();
|
|
this._findTimeout = null;
|
|
}, FIND_TIMEOUT);
|
|
} else if (this._dirtyMatch) {
|
|
// Immediately trigger searching for non-'find' operations, when the
|
|
// current state needs to be reset and matches re-calculated.
|
|
this.#nextMatch();
|
|
} else if (type === "again") {
|
|
this.#nextMatch();
|
|
|
|
// When the findbar was previously closed, and `highlightAll` is set,
|
|
// ensure that the matches on all active pages are highlighted again.
|
|
if (findbarClosed && this.#state.highlightAll) {
|
|
this.#updateAllPages();
|
|
}
|
|
} else if (type === "highlightallchange") {
|
|
// If there was a pending search operation, synchronously trigger a new
|
|
// search *first* to ensure that the correct matches are highlighted.
|
|
if (pendingTimeout) {
|
|
this.#nextMatch();
|
|
} else {
|
|
this._highlightMatches = true;
|
|
}
|
|
this.#updateAllPages(); // Update the highlighting on all active pages.
|
|
} else {
|
|
this.#nextMatch();
|
|
}
|
|
});
|
|
}
|
|
|
|
scrollMatchIntoView({
|
|
element = null,
|
|
selectedLeft = 0,
|
|
pageIndex = -1,
|
|
matchIndex = -1,
|
|
}) {
|
|
if (!this._scrollMatches || !element) {
|
|
return;
|
|
} else if (matchIndex === -1 || matchIndex !== this._selected.matchIdx) {
|
|
return;
|
|
} else if (pageIndex === -1 || pageIndex !== this._selected.pageIdx) {
|
|
return;
|
|
}
|
|
this._scrollMatches = false; // Ensure that scrolling only happens once.
|
|
|
|
const spot = {
|
|
top: MATCH_SCROLL_OFFSET_TOP,
|
|
left: selectedLeft + MATCH_SCROLL_OFFSET_LEFT,
|
|
};
|
|
scrollIntoView(element, spot, /* scrollMatches = */ true);
|
|
}
|
|
|
|
#reset() {
|
|
this._highlightMatches = false;
|
|
this._scrollMatches = false;
|
|
this._pdfDocument = null;
|
|
this._pageMatches = [];
|
|
this._pageMatchesLength = [];
|
|
this.#visitedPagesCount = 0;
|
|
this.#state = null;
|
|
// Currently selected match.
|
|
this._selected = {
|
|
pageIdx: -1,
|
|
matchIdx: -1,
|
|
};
|
|
// Where the find algorithm currently is in the document.
|
|
this._offset = {
|
|
pageIdx: null,
|
|
matchIdx: null,
|
|
wrapped: false,
|
|
};
|
|
this._extractTextPromises = [];
|
|
this._pageContents = []; // Stores the normalized text for each page.
|
|
this._pageDiffs = [];
|
|
this._hasDiacritics = [];
|
|
this._matchesCountTotal = 0;
|
|
this._pagesToSearch = null;
|
|
this._pendingFindMatches = new Set();
|
|
this._resumePageIdx = null;
|
|
this._dirtyMatch = false;
|
|
clearTimeout(this._findTimeout);
|
|
this._findTimeout = null;
|
|
|
|
this._firstPageCapability = createPromiseCapability();
|
|
}
|
|
|
|
/**
|
|
* @type {string|Array} The (current) normalized search query.
|
|
*/
|
|
get #query() {
|
|
const { query } = this.#state;
|
|
if (typeof query === "string") {
|
|
if (query !== this._rawQuery) {
|
|
this._rawQuery = query;
|
|
[this._normalizedQuery] = normalize(query);
|
|
}
|
|
return this._normalizedQuery;
|
|
}
|
|
// We don't bother caching the normalized search query in the Array-case,
|
|
// since this code-path is *essentially* unused in the default viewer.
|
|
return (query || []).filter(q => !!q).map(q => normalize(q)[0]);
|
|
}
|
|
|
|
#shouldDirtyMatch(state) {
|
|
// When the search query changes, regardless of the actual search command
|
|
// used, always re-calculate matches to avoid errors (fixes bug 1030622).
|
|
const newQuery = state.query,
|
|
prevQuery = this.#state.query;
|
|
const newType = typeof newQuery,
|
|
prevType = typeof prevQuery;
|
|
|
|
if (newType !== prevType) {
|
|
return true;
|
|
}
|
|
if (newType === "string") {
|
|
if (newQuery !== prevQuery) {
|
|
return true;
|
|
}
|
|
} else {
|
|
// Array
|
|
if (JSON.stringify(newQuery) !== JSON.stringify(prevQuery)) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
switch (state.type) {
|
|
case "again":
|
|
const pageNumber = this._selected.pageIdx + 1;
|
|
const linkService = this._linkService;
|
|
// Only treat a 'findagain' event as a new search operation when it's
|
|
// *absolutely* certain that the currently selected match is no longer
|
|
// visible, e.g. as a result of the user scrolling in the document.
|
|
//
|
|
// NOTE: If only a simple `this._linkService.page` check was used here,
|
|
// there's a risk that consecutive 'findagain' operations could "skip"
|
|
// over matches at the top/bottom of pages thus making them completely
|
|
// inaccessible when there's multiple pages visible in the viewer.
|
|
if (
|
|
pageNumber >= 1 &&
|
|
pageNumber <= linkService.pagesCount &&
|
|
pageNumber !== linkService.page &&
|
|
!linkService.isPageVisible(pageNumber)
|
|
) {
|
|
return true;
|
|
}
|
|
return false;
|
|
case "highlightallchange":
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/**
|
|
* Determine if the search query constitutes a "whole word", by comparing the
|
|
* first/last character type with the preceding/following character type.
|
|
*/
|
|
#isEntireWord(content, startIdx, length) {
|
|
let match = content
|
|
.slice(0, startIdx)
|
|
.match(NOT_DIACRITIC_FROM_END_REG_EXP);
|
|
if (match) {
|
|
const first = content.charCodeAt(startIdx);
|
|
const limit = match[1].charCodeAt(0);
|
|
if (getCharacterType(first) === getCharacterType(limit)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
match = content
|
|
.slice(startIdx + length)
|
|
.match(NOT_DIACRITIC_FROM_START_REG_EXP);
|
|
if (match) {
|
|
const last = content.charCodeAt(startIdx + length - 1);
|
|
const limit = match[1].charCodeAt(0);
|
|
if (getCharacterType(last) === getCharacterType(limit)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
#calculateRegExpMatch(query, entireWord, pageIndex, pageContent) {
|
|
const matches = (this._pageMatches[pageIndex] = []);
|
|
const matchesLength = (this._pageMatchesLength[pageIndex] = []);
|
|
if (!query) {
|
|
// The query can be empty because some chars like diacritics could have
|
|
// been stripped out.
|
|
return;
|
|
}
|
|
const diffs = this._pageDiffs[pageIndex];
|
|
let match;
|
|
while ((match = query.exec(pageContent)) !== null) {
|
|
if (
|
|
entireWord &&
|
|
!this.#isEntireWord(pageContent, match.index, match[0].length)
|
|
) {
|
|
continue;
|
|
}
|
|
|
|
const [matchPos, matchLen] = getOriginalIndex(
|
|
diffs,
|
|
match.index,
|
|
match[0].length
|
|
);
|
|
|
|
if (matchLen) {
|
|
matches.push(matchPos);
|
|
matchesLength.push(matchLen);
|
|
}
|
|
}
|
|
}
|
|
|
|
#convertToRegExpString(query, hasDiacritics) {
|
|
const { matchDiacritics } = this.#state;
|
|
let isUnicode = false;
|
|
query = query.replaceAll(
|
|
SPECIAL_CHARS_REG_EXP,
|
|
(
|
|
match,
|
|
p1 /* to escape */,
|
|
p2 /* punctuation */,
|
|
p3 /* whitespaces */,
|
|
p4 /* diacritics */,
|
|
p5 /* letters */
|
|
) => {
|
|
// We don't need to use a \s for whitespaces since all the different
|
|
// kind of whitespaces are replaced by a single " ".
|
|
|
|
if (p1) {
|
|
// Escape characters like *+?... to not interfer with regexp syntax.
|
|
return `[ ]*\\${p1}[ ]*`;
|
|
}
|
|
if (p2) {
|
|
// Allow whitespaces around punctuation signs.
|
|
return `[ ]*${p2}[ ]*`;
|
|
}
|
|
if (p3) {
|
|
// Replace spaces by \s+ to be sure to match any spaces.
|
|
return "[ ]+";
|
|
}
|
|
if (matchDiacritics) {
|
|
return p4 || p5;
|
|
}
|
|
|
|
if (p4) {
|
|
// Diacritics are removed with few exceptions.
|
|
return DIACRITICS_EXCEPTION.has(p4.charCodeAt(0)) ? p4 : "";
|
|
}
|
|
|
|
// A letter has been matched and it can be followed by any diacritics
|
|
// in normalized text.
|
|
if (hasDiacritics) {
|
|
isUnicode = true;
|
|
return `${p5}\\p{M}*`;
|
|
}
|
|
return p5;
|
|
}
|
|
);
|
|
|
|
const trailingSpaces = "[ ]*";
|
|
if (query.endsWith(trailingSpaces)) {
|
|
// The [ ]* has been added in order to help to match "foo . bar" but
|
|
// it doesn't make sense to match some whitespaces after the dot
|
|
// when it's the last character.
|
|
query = query.slice(0, query.length - trailingSpaces.length);
|
|
}
|
|
|
|
if (matchDiacritics) {
|
|
// aX must not match aXY.
|
|
if (hasDiacritics) {
|
|
DIACRITICS_EXCEPTION_STR ||= String.fromCharCode(
|
|
...DIACRITICS_EXCEPTION
|
|
);
|
|
|
|
isUnicode = true;
|
|
query = `${query}(?=[${DIACRITICS_EXCEPTION_STR}]|[^\\p{M}]|$)`;
|
|
}
|
|
}
|
|
|
|
return [isUnicode, query];
|
|
}
|
|
|
|
#calculateMatch(pageIndex) {
|
|
let query = this.#query;
|
|
if (query.length === 0) {
|
|
return; // Do nothing: the matches should be wiped out already.
|
|
}
|
|
const { caseSensitive, entireWord } = this.#state;
|
|
const pageContent = this._pageContents[pageIndex];
|
|
const hasDiacritics = this._hasDiacritics[pageIndex];
|
|
|
|
let isUnicode = false;
|
|
if (typeof query === "string") {
|
|
[isUnicode, query] = this.#convertToRegExpString(query, hasDiacritics);
|
|
} else {
|
|
// Words are sorted in reverse order to be sure that "foobar" is matched
|
|
// before "foo" in case the query is "foobar foo".
|
|
query = query
|
|
.sort()
|
|
.reverse()
|
|
.map(q => {
|
|
const [isUnicodePart, queryPart] = this.#convertToRegExpString(
|
|
q,
|
|
hasDiacritics
|
|
);
|
|
isUnicode ||= isUnicodePart;
|
|
return `(${queryPart})`;
|
|
})
|
|
.join("|");
|
|
}
|
|
|
|
const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`;
|
|
query = query ? new RegExp(query, flags) : null;
|
|
|
|
this.#calculateRegExpMatch(query, entireWord, pageIndex, pageContent);
|
|
|
|
// When `highlightAll` is set, ensure that the matches on previously
|
|
// rendered (and still active) pages are correctly highlighted.
|
|
if (this.#state.highlightAll) {
|
|
this.#updatePage(pageIndex);
|
|
}
|
|
if (this._resumePageIdx === pageIndex) {
|
|
this._resumePageIdx = null;
|
|
this.#nextPageMatch();
|
|
}
|
|
|
|
// Update the match count.
|
|
const pageMatchesCount = this._pageMatches[pageIndex].length;
|
|
this._matchesCountTotal += pageMatchesCount;
|
|
if (this.#updateMatchesCountOnProgress) {
|
|
if (pageMatchesCount > 0) {
|
|
this.#updateUIResultsCount();
|
|
}
|
|
} else if (++this.#visitedPagesCount === this._linkService.pagesCount) {
|
|
// For example, in GeckoView we want to have only the final update because
|
|
// the Java side provides only one object to update the counts.
|
|
this.#updateUIResultsCount();
|
|
}
|
|
}
|
|
|
|
#extractText() {
|
|
// Perform text extraction once if this method is called multiple times.
|
|
if (this._extractTextPromises.length > 0) {
|
|
return;
|
|
}
|
|
|
|
let promise = Promise.resolve();
|
|
for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
|
|
const extractTextCapability = createPromiseCapability();
|
|
this._extractTextPromises[i] = extractTextCapability.promise;
|
|
|
|
promise = promise.then(() => {
|
|
return this._pdfDocument
|
|
.getPage(i + 1)
|
|
.then(pdfPage => {
|
|
return pdfPage.getTextContent();
|
|
})
|
|
.then(
|
|
textContent => {
|
|
const strBuf = [];
|
|
|
|
for (const textItem of textContent.items) {
|
|
strBuf.push(textItem.str);
|
|
if (textItem.hasEOL) {
|
|
strBuf.push("\n");
|
|
}
|
|
}
|
|
|
|
// Store the normalized page content (text items) as one string.
|
|
[
|
|
this._pageContents[i],
|
|
this._pageDiffs[i],
|
|
this._hasDiacritics[i],
|
|
] = normalize(strBuf.join(""));
|
|
extractTextCapability.resolve();
|
|
},
|
|
reason => {
|
|
console.error(
|
|
`Unable to get text content for page ${i + 1}`,
|
|
reason
|
|
);
|
|
// Page error -- assuming no text content.
|
|
this._pageContents[i] = "";
|
|
this._pageDiffs[i] = null;
|
|
this._hasDiacritics[i] = false;
|
|
extractTextCapability.resolve();
|
|
}
|
|
);
|
|
});
|
|
}
|
|
}
|
|
|
|
#updatePage(index) {
|
|
if (this._scrollMatches && this._selected.pageIdx === index) {
|
|
// If the page is selected, scroll the page into view, which triggers
|
|
// rendering the page, which adds the text layer. Once the text layer
|
|
// is built, it will attempt to scroll the selected match into view.
|
|
this._linkService.page = index + 1;
|
|
}
|
|
|
|
this._eventBus.dispatch("updatetextlayermatches", {
|
|
source: this,
|
|
pageIndex: index,
|
|
});
|
|
}
|
|
|
|
#updateAllPages() {
|
|
this._eventBus.dispatch("updatetextlayermatches", {
|
|
source: this,
|
|
pageIndex: -1,
|
|
});
|
|
}
|
|
|
|
#nextMatch() {
|
|
const previous = this.#state.findPrevious;
|
|
const currentPageIndex = this._linkService.page - 1;
|
|
const numPages = this._linkService.pagesCount;
|
|
|
|
this._highlightMatches = true;
|
|
|
|
if (this._dirtyMatch) {
|
|
// Need to recalculate the matches, reset everything.
|
|
this._dirtyMatch = false;
|
|
this._selected.pageIdx = this._selected.matchIdx = -1;
|
|
this._offset.pageIdx = currentPageIndex;
|
|
this._offset.matchIdx = null;
|
|
this._offset.wrapped = false;
|
|
this._resumePageIdx = null;
|
|
this._pageMatches.length = 0;
|
|
this._pageMatchesLength.length = 0;
|
|
this.#visitedPagesCount = 0;
|
|
this._matchesCountTotal = 0;
|
|
|
|
this.#updateAllPages(); // Wipe out any previously highlighted matches.
|
|
|
|
for (let i = 0; i < numPages; i++) {
|
|
// Start finding the matches as soon as the text is extracted.
|
|
if (this._pendingFindMatches.has(i)) {
|
|
continue;
|
|
}
|
|
this._pendingFindMatches.add(i);
|
|
this._extractTextPromises[i].then(() => {
|
|
this._pendingFindMatches.delete(i);
|
|
this.#calculateMatch(i);
|
|
});
|
|
}
|
|
}
|
|
|
|
// If there's no query there's no point in searching.
|
|
const query = this.#query;
|
|
if (query.length === 0) {
|
|
this.#updateUIState(FindState.FOUND);
|
|
return;
|
|
}
|
|
// If we're waiting on a page, we return since we can't do anything else.
|
|
if (this._resumePageIdx) {
|
|
return;
|
|
}
|
|
|
|
const offset = this._offset;
|
|
// Keep track of how many pages we should maximally iterate through.
|
|
this._pagesToSearch = numPages;
|
|
// If there's already a `matchIdx` that means we are iterating through a
|
|
// page's matches.
|
|
if (offset.matchIdx !== null) {
|
|
const numPageMatches = this._pageMatches[offset.pageIdx].length;
|
|
if (
|
|
(!previous && offset.matchIdx + 1 < numPageMatches) ||
|
|
(previous && offset.matchIdx > 0)
|
|
) {
|
|
// The simple case; we just have advance the matchIdx to select
|
|
// the next match on the page.
|
|
offset.matchIdx = previous ? offset.matchIdx - 1 : offset.matchIdx + 1;
|
|
this.#updateMatch(/* found = */ true);
|
|
return;
|
|
}
|
|
// We went beyond the current page's matches, so we advance to
|
|
// the next page.
|
|
this.#advanceOffsetPage(previous);
|
|
}
|
|
// Start searching through the page.
|
|
this.#nextPageMatch();
|
|
}
|
|
|
|
#matchesReady(matches) {
|
|
const offset = this._offset;
|
|
const numMatches = matches.length;
|
|
const previous = this.#state.findPrevious;
|
|
|
|
if (numMatches) {
|
|
// There were matches for the page, so initialize `matchIdx`.
|
|
offset.matchIdx = previous ? numMatches - 1 : 0;
|
|
this.#updateMatch(/* found = */ true);
|
|
return true;
|
|
}
|
|
// No matches, so attempt to search the next page.
|
|
this.#advanceOffsetPage(previous);
|
|
if (offset.wrapped) {
|
|
offset.matchIdx = null;
|
|
if (this._pagesToSearch < 0) {
|
|
// No point in wrapping again, there were no matches.
|
|
this.#updateMatch(/* found = */ false);
|
|
// While matches were not found, searching for a page
|
|
// with matches should nevertheless halt.
|
|
return true;
|
|
}
|
|
}
|
|
// Matches were not found (and searching is not done).
|
|
return false;
|
|
}
|
|
|
|
#nextPageMatch() {
|
|
if (this._resumePageIdx !== null) {
|
|
console.error("There can only be one pending page.");
|
|
}
|
|
|
|
let matches = null;
|
|
do {
|
|
const pageIdx = this._offset.pageIdx;
|
|
matches = this._pageMatches[pageIdx];
|
|
if (!matches) {
|
|
// The matches don't exist yet for processing by `_matchesReady`,
|
|
// so set a resume point for when they do exist.
|
|
this._resumePageIdx = pageIdx;
|
|
break;
|
|
}
|
|
} while (!this.#matchesReady(matches));
|
|
}
|
|
|
|
#advanceOffsetPage(previous) {
|
|
const offset = this._offset;
|
|
const numPages = this._linkService.pagesCount;
|
|
offset.pageIdx = previous ? offset.pageIdx - 1 : offset.pageIdx + 1;
|
|
offset.matchIdx = null;
|
|
|
|
this._pagesToSearch--;
|
|
|
|
if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
|
|
offset.pageIdx = previous ? numPages - 1 : 0;
|
|
offset.wrapped = true;
|
|
}
|
|
}
|
|
|
|
#updateMatch(found = false) {
|
|
let state = FindState.NOT_FOUND;
|
|
const wrapped = this._offset.wrapped;
|
|
this._offset.wrapped = false;
|
|
|
|
if (found) {
|
|
const previousPage = this._selected.pageIdx;
|
|
this._selected.pageIdx = this._offset.pageIdx;
|
|
this._selected.matchIdx = this._offset.matchIdx;
|
|
state = wrapped ? FindState.WRAPPED : FindState.FOUND;
|
|
|
|
// Update the currently selected page to wipe out any selected matches.
|
|
if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
|
|
this.#updatePage(previousPage);
|
|
}
|
|
}
|
|
|
|
this.#updateUIState(state, this.#state.findPrevious);
|
|
if (this._selected.pageIdx !== -1) {
|
|
// Ensure that the match will be scrolled into view.
|
|
this._scrollMatches = true;
|
|
|
|
this.#updatePage(this._selected.pageIdx);
|
|
}
|
|
}
|
|
|
|
#onFindBarClose(evt) {
|
|
const pdfDocument = this._pdfDocument;
|
|
// Since searching is asynchronous, ensure that the removal of highlighted
|
|
// matches (from the UI) is async too such that the 'updatetextlayermatches'
|
|
// events will always be dispatched in the expected order.
|
|
this._firstPageCapability.promise.then(() => {
|
|
// Only update the UI if the document is open, and is the current one.
|
|
if (
|
|
!this._pdfDocument ||
|
|
(pdfDocument && this._pdfDocument !== pdfDocument)
|
|
) {
|
|
return;
|
|
}
|
|
// Ensure that a pending, not yet started, search operation is aborted.
|
|
if (this._findTimeout) {
|
|
clearTimeout(this._findTimeout);
|
|
this._findTimeout = null;
|
|
}
|
|
// Abort any long running searches, to avoid a match being scrolled into
|
|
// view *after* the findbar has been closed. In this case `this._offset`
|
|
// will most likely differ from `this._selected`, hence we also ensure
|
|
// that any new search operation will always start with a clean slate.
|
|
if (this._resumePageIdx) {
|
|
this._resumePageIdx = null;
|
|
this._dirtyMatch = true;
|
|
}
|
|
// Avoid the UI being in a pending state when the findbar is re-opened.
|
|
this.#updateUIState(FindState.FOUND);
|
|
|
|
this._highlightMatches = false;
|
|
this.#updateAllPages(); // Wipe out any previously highlighted matches.
|
|
});
|
|
}
|
|
|
|
#requestMatchesCount() {
|
|
const { pageIdx, matchIdx } = this._selected;
|
|
let current = 0,
|
|
total = this._matchesCountTotal;
|
|
if (matchIdx !== -1) {
|
|
for (let i = 0; i < pageIdx; i++) {
|
|
current += this._pageMatches[i]?.length || 0;
|
|
}
|
|
current += matchIdx + 1;
|
|
}
|
|
// When searching starts, this method may be called before the `pageMatches`
|
|
// have been counted (in `_calculateMatch`). Ensure that the UI won't show
|
|
// temporarily broken state when the active find result doesn't make sense.
|
|
if (current < 1 || current > total) {
|
|
current = total = 0;
|
|
}
|
|
return { current, total };
|
|
}
|
|
|
|
#updateUIResultsCount() {
|
|
this._eventBus.dispatch("updatefindmatchescount", {
|
|
source: this,
|
|
matchesCount: this.#requestMatchesCount(),
|
|
});
|
|
}
|
|
|
|
#updateUIState(state, previous = false) {
|
|
if (
|
|
!this.#updateMatchesCountOnProgress &&
|
|
(this.#visitedPagesCount !== this._linkService.pagesCount ||
|
|
state === FindState.PENDING)
|
|
) {
|
|
// When this.#updateMatchesCountOnProgress is false we only send an update
|
|
// when everything is ready.
|
|
return;
|
|
}
|
|
|
|
this._eventBus.dispatch("updatefindcontrolstate", {
|
|
source: this,
|
|
state,
|
|
previous,
|
|
matchesCount: this.#requestMatchesCount(),
|
|
rawQuery: this.#state?.query ?? null,
|
|
});
|
|
}
|
|
}
|
|
|
|
export { FindState, PDFFindController };
|