pdf.js/web/pdf_find_controller.js
Jonas Jenwald 36881e3770 Ensure that all import and require statements, in the entire code-base, have a .js file extension
In order to eventually get rid of SystemJS and start using native `import`s instead, we'll need to provide "complete" file identifiers since otherwise there'll be MIME type errors when attempting to use `import`.
2020-01-04 13:01:43 +01:00

744 lines
23 KiB
JavaScript

/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { getGlobalEventBus, scrollIntoView } from "./ui_utils.js";
import { createPromiseCapability } from "pdfjs-lib";
import { getCharacterType } from "./pdf_find_utils.js";
const FindState = {
FOUND: 0,
NOT_FOUND: 1,
WRAPPED: 2,
PENDING: 3,
};
const FIND_TIMEOUT = 250; // ms
const MATCH_SCROLL_OFFSET_TOP = -50; // px
const MATCH_SCROLL_OFFSET_LEFT = -400; // px
const CHARACTERS_TO_NORMALIZE = {
"\u2018": "'", // Left single quotation mark
"\u2019": "'", // Right single quotation mark
"\u201A": "'", // Single low-9 quotation mark
"\u201B": "'", // Single high-reversed-9 quotation mark
"\u201C": '"', // Left double quotation mark
"\u201D": '"', // Right double quotation mark
"\u201E": '"', // Double low-9 quotation mark
"\u201F": '"', // Double high-reversed-9 quotation mark
"\u00BC": "1/4", // Vulgar fraction one quarter
"\u00BD": "1/2", // Vulgar fraction one half
"\u00BE": "3/4", // Vulgar fraction three quarters
};
let normalizationRegex = null;
function normalize(text) {
if (!normalizationRegex) {
// Compile the regular expression for text normalization once.
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
normalizationRegex = new RegExp(`[${replace}]`, "g");
}
return text.replace(normalizationRegex, function(ch) {
return CHARACTERS_TO_NORMALIZE[ch];
});
}
/**
* @typedef {Object} PDFFindControllerOptions
* @property {IPDFLinkService} linkService - The navigation/linking service.
* @property {EventBus} eventBus - The application event bus.
*/
/**
* Provides search functionality to find a given string in a PDF document.
*/
class PDFFindController {
/**
* @param {PDFFindControllerOptions} options
*/
constructor({ linkService, eventBus = getGlobalEventBus() }) {
this._linkService = linkService;
this._eventBus = eventBus;
this._reset();
eventBus.on("findbarclose", this._onFindBarClose.bind(this));
}
get highlightMatches() {
return this._highlightMatches;
}
get pageMatches() {
return this._pageMatches;
}
get pageMatchesLength() {
return this._pageMatchesLength;
}
get selected() {
return this._selected;
}
get state() {
return this._state;
}
/**
* Set a reference to the PDF document in order to search it.
* Note that searching is not possible if this method is not called.
*
* @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
*/
setDocument(pdfDocument) {
if (this._pdfDocument) {
this._reset();
}
if (!pdfDocument) {
return;
}
this._pdfDocument = pdfDocument;
this._firstPageCapability.resolve();
}
executeCommand(cmd, state) {
if (!state) {
return;
}
const pdfDocument = this._pdfDocument;
if (this._state === null || this._shouldDirtyMatch(cmd, state)) {
this._dirtyMatch = true;
}
this._state = state;
if (cmd !== "findhighlightallchange") {
this._updateUIState(FindState.PENDING);
}
this._firstPageCapability.promise.then(() => {
// If the document was closed before searching began, or if the search
// operation was relevant for a previously opened document, do nothing.
if (
!this._pdfDocument ||
(pdfDocument && this._pdfDocument !== pdfDocument)
) {
return;
}
this._extractText();
const findbarClosed = !this._highlightMatches;
const pendingTimeout = !!this._findTimeout;
if (this._findTimeout) {
clearTimeout(this._findTimeout);
this._findTimeout = null;
}
if (cmd === "find") {
// Trigger the find action with a small delay to avoid starting the
// search when the user is still typing (saving resources).
this._findTimeout = setTimeout(() => {
this._nextMatch();
this._findTimeout = null;
}, FIND_TIMEOUT);
} else if (this._dirtyMatch) {
// Immediately trigger searching for non-'find' operations, when the
// current state needs to be reset and matches re-calculated.
this._nextMatch();
} else if (cmd === "findagain") {
this._nextMatch();
// When the findbar was previously closed, and `highlightAll` is set,
// ensure that the matches on all active pages are highlighted again.
if (findbarClosed && this._state.highlightAll) {
this._updateAllPages();
}
} else if (cmd === "findhighlightallchange") {
// If there was a pending search operation, synchronously trigger a new
// search *first* to ensure that the correct matches are highlighted.
if (pendingTimeout) {
this._nextMatch();
} else {
this._highlightMatches = true;
}
this._updateAllPages(); // Update the highlighting on all active pages.
} else {
this._nextMatch();
}
});
}
scrollMatchIntoView({ element = null, pageIndex = -1, matchIndex = -1 }) {
if (!this._scrollMatches || !element) {
return;
} else if (matchIndex === -1 || matchIndex !== this._selected.matchIdx) {
return;
} else if (pageIndex === -1 || pageIndex !== this._selected.pageIdx) {
return;
}
this._scrollMatches = false; // Ensure that scrolling only happens once.
const spot = {
top: MATCH_SCROLL_OFFSET_TOP,
left: MATCH_SCROLL_OFFSET_LEFT,
};
scrollIntoView(element, spot, /* skipOverflowHiddenElements = */ true);
}
_reset() {
this._highlightMatches = false;
this._scrollMatches = false;
this._pdfDocument = null;
this._pageMatches = [];
this._pageMatchesLength = [];
this._state = null;
// Currently selected match.
this._selected = {
pageIdx: -1,
matchIdx: -1,
};
// Where the find algorithm currently is in the document.
this._offset = {
pageIdx: null,
matchIdx: null,
wrapped: false,
};
this._extractTextPromises = [];
this._pageContents = []; // Stores the normalized text for each page.
this._matchesCountTotal = 0;
this._pagesToSearch = null;
this._pendingFindMatches = Object.create(null);
this._resumePageIdx = null;
this._dirtyMatch = false;
clearTimeout(this._findTimeout);
this._findTimeout = null;
this._firstPageCapability = createPromiseCapability();
}
/**
* @type {string} The (current) normalized search query.
*/
get _query() {
if (this._state.query !== this._rawQuery) {
this._rawQuery = this._state.query;
this._normalizedQuery = normalize(this._state.query);
}
return this._normalizedQuery;
}
_shouldDirtyMatch(cmd, state) {
// When the search query changes, regardless of the actual search command
// used, always re-calculate matches to avoid errors (fixes bug 1030622).
if (state.query !== this._state.query) {
return true;
}
switch (cmd) {
case "findagain":
const pageNumber = this._selected.pageIdx + 1;
const linkService = this._linkService;
// Only treat a 'findagain' event as a new search operation when it's
// *absolutely* certain that the currently selected match is no longer
// visible, e.g. as a result of the user scrolling in the document.
//
// NOTE: If only a simple `this._linkService.page` check was used here,
// there's a risk that consecutive 'findagain' operations could "skip"
// over matches at the top/bottom of pages thus making them completely
// inaccessible when there's multiple pages visible in the viewer.
if (
pageNumber >= 1 &&
pageNumber <= linkService.pagesCount &&
pageNumber !== linkService.page &&
!linkService.isPageVisible(pageNumber)
) {
return true;
}
return false;
case "findhighlightallchange":
return false;
}
return true;
}
/**
* Helper for multi-term search that fills the `matchesWithLength` array
* and handles cases where one search term includes another search term (for
* example, "tamed tame" or "this is"). It looks for intersecting terms in
* the `matches` and keeps elements with a longer match length.
*/
_prepareMatches(matchesWithLength, matches, matchesLength) {
function isSubTerm(matchesWithLength, currentIndex) {
const currentElem = matchesWithLength[currentIndex];
const nextElem = matchesWithLength[currentIndex + 1];
// Check for cases like "TAMEd TAME".
if (
currentIndex < matchesWithLength.length - 1 &&
currentElem.match === nextElem.match
) {
currentElem.skipped = true;
return true;
}
// Check for cases like "thIS IS".
for (let i = currentIndex - 1; i >= 0; i--) {
const prevElem = matchesWithLength[i];
if (prevElem.skipped) {
continue;
}
if (prevElem.match + prevElem.matchLength < currentElem.match) {
break;
}
if (
prevElem.match + prevElem.matchLength >=
currentElem.match + currentElem.matchLength
) {
currentElem.skipped = true;
return true;
}
}
return false;
}
// Sort the array of `{ match: <match>, matchLength: <matchLength> }`
// objects on increasing index first and on the length otherwise.
matchesWithLength.sort(function(a, b) {
return a.match === b.match
? a.matchLength - b.matchLength
: a.match - b.match;
});
for (let i = 0, len = matchesWithLength.length; i < len; i++) {
if (isSubTerm(matchesWithLength, i)) {
continue;
}
matches.push(matchesWithLength[i].match);
matchesLength.push(matchesWithLength[i].matchLength);
}
}
/**
* Determine if the search query constitutes a "whole word", by comparing the
* first/last character type with the preceding/following character type.
*/
_isEntireWord(content, startIdx, length) {
if (startIdx > 0) {
const first = content.charCodeAt(startIdx);
const limit = content.charCodeAt(startIdx - 1);
if (getCharacterType(first) === getCharacterType(limit)) {
return false;
}
}
const endIdx = startIdx + length - 1;
if (endIdx < content.length - 1) {
const last = content.charCodeAt(endIdx);
const limit = content.charCodeAt(endIdx + 1);
if (getCharacterType(last) === getCharacterType(limit)) {
return false;
}
}
return true;
}
_calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
const matches = [];
const queryLen = query.length;
let matchIdx = -queryLen;
while (true) {
matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
if (matchIdx === -1) {
break;
}
if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
continue;
}
matches.push(matchIdx);
}
this._pageMatches[pageIndex] = matches;
}
_calculateWordMatch(query, pageIndex, pageContent, entireWord) {
const matchesWithLength = [];
// Divide the query into pieces and search for text in each piece.
const queryArray = query.match(/\S+/g);
for (let i = 0, len = queryArray.length; i < len; i++) {
const subquery = queryArray[i];
const subqueryLen = subquery.length;
let matchIdx = -subqueryLen;
while (true) {
matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
if (matchIdx === -1) {
break;
}
if (
entireWord &&
!this._isEntireWord(pageContent, matchIdx, subqueryLen)
) {
continue;
}
// Other searches do not, so we store the length.
matchesWithLength.push({
match: matchIdx,
matchLength: subqueryLen,
skipped: false,
});
}
}
// Prepare arrays for storing the matches.
this._pageMatchesLength[pageIndex] = [];
this._pageMatches[pageIndex] = [];
// Sort `matchesWithLength`, remove intersecting terms and put the result
// into the two arrays.
this._prepareMatches(
matchesWithLength,
this._pageMatches[pageIndex],
this._pageMatchesLength[pageIndex]
);
}
_calculateMatch(pageIndex) {
let pageContent = this._pageContents[pageIndex];
let query = this._query;
const { caseSensitive, entireWord, phraseSearch } = this._state;
if (query.length === 0) {
// Do nothing: the matches should be wiped out already.
return;
}
if (!caseSensitive) {
pageContent = pageContent.toLowerCase();
query = query.toLowerCase();
}
if (phraseSearch) {
this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
} else {
this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
}
// When `highlightAll` is set, ensure that the matches on previously
// rendered (and still active) pages are correctly highlighted.
if (this._state.highlightAll) {
this._updatePage(pageIndex);
}
if (this._resumePageIdx === pageIndex) {
this._resumePageIdx = null;
this._nextPageMatch();
}
// Update the match count.
const pageMatchesCount = this._pageMatches[pageIndex].length;
if (pageMatchesCount > 0) {
this._matchesCountTotal += pageMatchesCount;
this._updateUIResultsCount();
}
}
_extractText() {
// Perform text extraction once if this method is called multiple times.
if (this._extractTextPromises.length > 0) {
return;
}
let promise = Promise.resolve();
for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
const extractTextCapability = createPromiseCapability();
this._extractTextPromises[i] = extractTextCapability.promise;
promise = promise.then(() => {
return this._pdfDocument
.getPage(i + 1)
.then(pdfPage => {
return pdfPage.getTextContent({
normalizeWhitespace: true,
});
})
.then(
textContent => {
const textItems = textContent.items;
const strBuf = [];
for (let j = 0, jj = textItems.length; j < jj; j++) {
strBuf.push(textItems[j].str);
}
// Store the normalized page content (text items) as one string.
this._pageContents[i] = normalize(strBuf.join(""));
extractTextCapability.resolve(i);
},
reason => {
console.error(
`Unable to get text content for page ${i + 1}`,
reason
);
// Page error -- assuming no text content.
this._pageContents[i] = "";
extractTextCapability.resolve(i);
}
);
});
}
}
_updatePage(index) {
if (this._scrollMatches && this._selected.pageIdx === index) {
// If the page is selected, scroll the page into view, which triggers
// rendering the page, which adds the text layer. Once the text layer
// is built, it will attempt to scroll the selected match into view.
this._linkService.page = index + 1;
}
this._eventBus.dispatch("updatetextlayermatches", {
source: this,
pageIndex: index,
});
}
_updateAllPages() {
this._eventBus.dispatch("updatetextlayermatches", {
source: this,
pageIndex: -1,
});
}
_nextMatch() {
const previous = this._state.findPrevious;
const currentPageIndex = this._linkService.page - 1;
const numPages = this._linkService.pagesCount;
this._highlightMatches = true;
if (this._dirtyMatch) {
// Need to recalculate the matches, reset everything.
this._dirtyMatch = false;
this._selected.pageIdx = this._selected.matchIdx = -1;
this._offset.pageIdx = currentPageIndex;
this._offset.matchIdx = null;
this._offset.wrapped = false;
this._resumePageIdx = null;
this._pageMatches.length = 0;
this._pageMatchesLength.length = 0;
this._matchesCountTotal = 0;
this._updateAllPages(); // Wipe out any previously highlighted matches.
for (let i = 0; i < numPages; i++) {
// Start finding the matches as soon as the text is extracted.
if (this._pendingFindMatches[i] === true) {
continue;
}
this._pendingFindMatches[i] = true;
this._extractTextPromises[i].then(pageIdx => {
delete this._pendingFindMatches[pageIdx];
this._calculateMatch(pageIdx);
});
}
}
// If there's no query there's no point in searching.
if (this._query === "") {
this._updateUIState(FindState.FOUND);
return;
}
// If we're waiting on a page, we return since we can't do anything else.
if (this._resumePageIdx) {
return;
}
const offset = this._offset;
// Keep track of how many pages we should maximally iterate through.
this._pagesToSearch = numPages;
// If there's already a `matchIdx` that means we are iterating through a
// page's matches.
if (offset.matchIdx !== null) {
const numPageMatches = this._pageMatches[offset.pageIdx].length;
if (
(!previous && offset.matchIdx + 1 < numPageMatches) ||
(previous && offset.matchIdx > 0)
) {
// The simple case; we just have advance the matchIdx to select
// the next match on the page.
offset.matchIdx = previous ? offset.matchIdx - 1 : offset.matchIdx + 1;
this._updateMatch(/* found = */ true);
return;
}
// We went beyond the current page's matches, so we advance to
// the next page.
this._advanceOffsetPage(previous);
}
// Start searching through the page.
this._nextPageMatch();
}
_matchesReady(matches) {
const offset = this._offset;
const numMatches = matches.length;
const previous = this._state.findPrevious;
if (numMatches) {
// There were matches for the page, so initialize `matchIdx`.
offset.matchIdx = previous ? numMatches - 1 : 0;
this._updateMatch(/* found = */ true);
return true;
}
// No matches, so attempt to search the next page.
this._advanceOffsetPage(previous);
if (offset.wrapped) {
offset.matchIdx = null;
if (this._pagesToSearch < 0) {
// No point in wrapping again, there were no matches.
this._updateMatch(/* found = */ false);
// While matches were not found, searching for a page
// with matches should nevertheless halt.
return true;
}
}
// Matches were not found (and searching is not done).
return false;
}
_nextPageMatch() {
if (this._resumePageIdx !== null) {
console.error("There can only be one pending page.");
}
let matches = null;
do {
const pageIdx = this._offset.pageIdx;
matches = this._pageMatches[pageIdx];
if (!matches) {
// The matches don't exist yet for processing by `_matchesReady`,
// so set a resume point for when they do exist.
this._resumePageIdx = pageIdx;
break;
}
} while (!this._matchesReady(matches));
}
_advanceOffsetPage(previous) {
const offset = this._offset;
const numPages = this._linkService.pagesCount;
offset.pageIdx = previous ? offset.pageIdx - 1 : offset.pageIdx + 1;
offset.matchIdx = null;
this._pagesToSearch--;
if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
offset.pageIdx = previous ? numPages - 1 : 0;
offset.wrapped = true;
}
}
_updateMatch(found = false) {
let state = FindState.NOT_FOUND;
const wrapped = this._offset.wrapped;
this._offset.wrapped = false;
if (found) {
const previousPage = this._selected.pageIdx;
this._selected.pageIdx = this._offset.pageIdx;
this._selected.matchIdx = this._offset.matchIdx;
state = wrapped ? FindState.WRAPPED : FindState.FOUND;
// Update the currently selected page to wipe out any selected matches.
if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
this._updatePage(previousPage);
}
}
this._updateUIState(state, this._state.findPrevious);
if (this._selected.pageIdx !== -1) {
// Ensure that the match will be scrolled into view.
this._scrollMatches = true;
this._updatePage(this._selected.pageIdx);
}
}
_onFindBarClose(evt) {
const pdfDocument = this._pdfDocument;
// Since searching is asynchronous, ensure that the removal of highlighted
// matches (from the UI) is async too such that the 'updatetextlayermatches'
// events will always be dispatched in the expected order.
this._firstPageCapability.promise.then(() => {
// Only update the UI if the document is open, and is the current one.
if (
!this._pdfDocument ||
(pdfDocument && this._pdfDocument !== pdfDocument)
) {
return;
}
// Ensure that a pending, not yet started, search operation is aborted.
if (this._findTimeout) {
clearTimeout(this._findTimeout);
this._findTimeout = null;
}
// Abort any long running searches, to avoid a match being scrolled into
// view *after* the findbar has been closed. In this case `this._offset`
// will most likely differ from `this._selected`, hence we also ensure
// that any new search operation will always start with a clean slate.
if (this._resumePageIdx) {
this._resumePageIdx = null;
this._dirtyMatch = true;
}
// Avoid the UI being in a pending state when the findbar is re-opened.
this._updateUIState(FindState.FOUND);
this._highlightMatches = false;
this._updateAllPages(); // Wipe out any previously highlighted matches.
});
}
_requestMatchesCount() {
const { pageIdx, matchIdx } = this._selected;
let current = 0,
total = this._matchesCountTotal;
if (matchIdx !== -1) {
for (let i = 0; i < pageIdx; i++) {
current += (this._pageMatches[i] && this._pageMatches[i].length) || 0;
}
current += matchIdx + 1;
}
// When searching starts, this method may be called before the `pageMatches`
// have been counted (in `_calculateMatch`). Ensure that the UI won't show
// temporarily broken state when the active find result doesn't make sense.
if (current < 1 || current > total) {
current = total = 0;
}
return { current, total };
}
_updateUIResultsCount() {
this._eventBus.dispatch("updatefindmatchescount", {
source: this,
matchesCount: this._requestMatchesCount(),
});
}
_updateUIState(state, previous) {
this._eventBus.dispatch("updatefindcontrolstate", {
source: this,
state,
previous,
matchesCount: this._requestMatchesCount(),
});
}
}
export { FindState, PDFFindController };