1b402996cf
This commit shows that we can now unit test the find controller and that executing regular queries works. Note that this is only a first step and not a complete suite of unit tests for all possible options of the find controller. While writing this unit test, I found two smaller issues that I addressed directly. The first one is that in the previous find controller refactoring I forgot to rename some occurrences of a now private member variable. Fortunately this did not cause any bugs since we did have a public getter and the fetched value may be changed by reference, but it's nevertheless good to fix. The second issue is that some entries in the `test/unit/clitests.json` file were not correct, resulting in these tests not being executed on e.g., Travis CI.
584 lines
18 KiB
JavaScript
584 lines
18 KiB
JavaScript
/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
|
|
|
|
import { createPromiseCapability } from 'pdfjs-lib';
|
|
import { getCharacterType } from './pdf_find_utils';
|
|
import { getGlobalEventBus } from './dom_events';
|
|
|
|
/**
 * States reported to the UI through the 'updatefindcontrolstate' event.
 *
 * @enum {number}
 */
const FindState = {
  // A match was selected without wrapping around the document.
  FOUND: 0,
  // No match could be selected.
  NOT_FOUND: 1,
  // A match was selected after the search wrapped around the document.
  WRAPPED: 2,
  // A find command was received and is being processed.
  PENDING: 3,
};
|
|
|
|
// Delay applied before running a search that was started with the 'find'
// command, so that the search does not start while the user is still typing.
const FIND_TIMEOUT = 250; // ms
|
|
|
|
// Maps typographic characters to the plain-text replacement that is used for
// both the page contents and the query before they are compared. The keys are
// also joined into a character class, so every key must be a single character
// that has no special meaning inside a regular expression character class.
const CHARACTERS_TO_NORMALIZE = {
  '\u2018': '\'', // Left single quotation mark
  '\u2019': '\'', // Right single quotation mark
  '\u201A': '\'', // Single low-9 quotation mark
  '\u201B': '\'', // Single high-reversed-9 quotation mark
  '\u201C': '"', // Left double quotation mark
  '\u201D': '"', // Right double quotation mark
  '\u201E': '"', // Double low-9 quotation mark
  '\u201F': '"', // Double high-reversed-9 quotation mark
  '\u00BC': '1/4', // Vulgar fraction one quarter
  '\u00BD': '1/2', // Vulgar fraction one half
  '\u00BE': '3/4', // Vulgar fraction three quarters
};
|
|
|
|
/**
 * @typedef {Object} PDFFindControllerOptions
 * @property {IPDFLinkService} linkService - The navigation/linking service.
 * @property {EventBus} eventBus - The application event bus.
 */
|
|
|
|
/**
 * Provides search functionality to find a given string in a PDF document.
 *
 * The controller extracts the text of every page once, computes the matches
 * per page for the current find state and reports progress through the event
 * bus ('updatefindcontrolstate', 'updatefindmatchescount' and
 * 'updatetextlayermatches').
 */
class PDFFindController {
  /**
   * @param {PDFFindControllerOptions} options
   */
  constructor({ linkService, eventBus = getGlobalEventBus(), }) {
    this._linkService = linkService;
    this._eventBus = eventBus;

    this._reset();

    eventBus.on('findbarclose', () => {
      this._highlightMatches = false;

      // A page index of -1 is dispatched here; no specific page is targeted.
      eventBus.dispatch('updatetextlayermatches', {
        source: this,
        pageIndex: -1,
      });
    });

    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('');
    this._normalizationRegex = new RegExp(`[${replace}]`, 'g');
  }

  /** Whether matches should currently be highlighted. */
  get highlightMatches() {
    return this._highlightMatches;
  }

  /** Per-page arrays with the start index of every match. */
  get pageMatches() {
    return this._pageMatches;
  }

  /**
   * Per-page arrays with the length of every match. This is only populated
   * by the multi-term (non-phrase) search and is `null` otherwise.
   */
  get pageMatchesLength() {
    return this._pageMatchesLength;
  }

  /** The currently selected match as `{ pageIdx, matchIdx, }`. */
  get selected() {
    return this._selected;
  }

  /** The find state object passed to the last `executeCommand` call. */
  get state() {
    return this._state;
  }

  /**
   * Set a reference to the PDF document in order to search it.
   * Note that searching is not possible if this method is not called.
   *
   * @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
   */
  setDocument(pdfDocument) {
    if (this._pdfDocument) {
      this._reset();
    }
    if (!pdfDocument) {
      return;
    }
    this._pdfDocument = pdfDocument;
  }

  /**
   * Execute a find command. Nothing happens if no document has been set.
   *
   * @param {string} cmd - The command name. Any command other than
   *   'findagain' forces the matches to be recalculated, and 'find' delays
   *   the search by `FIND_TIMEOUT` milliseconds.
   * @param {Object} state - The find state. The controller reads its `query`,
   *   `caseSensitive`, `entireWord`, `phraseSearch` and `findPrevious`
   *   properties.
   */
  executeCommand(cmd, state) {
    const pdfDocument = this._pdfDocument;
    if (!pdfDocument) {
      return;
    }

    if (this._state === null || cmd !== 'findagain') {
      this._dirtyMatch = true;
    }
    this._state = state;
    this._updateUIState(FindState.PENDING);

    this._firstPagePromise.then(() => {
      // The document may have been closed or replaced while waiting for the
      // pages to initialize; a stale command must not touch the new state.
      if (this._pdfDocument !== pdfDocument) {
        return;
      }
      this._extractText();

      clearTimeout(this._findTimeout);
      if (cmd === 'find') {
        // Trigger the find action with a small delay to avoid starting the
        // search when the user is still typing (saving resources).
        this._findTimeout =
          setTimeout(this._nextMatch.bind(this), FIND_TIMEOUT);
      } else {
        this._nextMatch();
      }
    });
  }

  /**
   * Restore the initial state of the controller and start waiting for the
   * next 'pagesinit' event.
   */
  _reset() {
    this._highlightMatches = false;
    this._pdfDocument = null;
    this._pageMatches = [];
    this._pageMatchesLength = null;
    this._state = null;
    this._selected = { // Currently selected match.
      pageIdx: -1,
      matchIdx: -1,
    };
    this._offset = { // Where the find algorithm currently is in the document.
      pageIdx: null,
      matchIdx: null,
    };
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the text for each page.
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = Object.create(null);
    this._resumePageIdx = null;
    this._dirtyMatch = false;
    // Cancel a delayed search before dropping its handle; otherwise it would
    // still fire after the reset and operate on the cleared state.
    clearTimeout(this._findTimeout);
    this._findTimeout = null;

    this._firstPagePromise = new Promise((resolve) => {
      const eventBus = this._eventBus;
      eventBus.on('pagesinit', function onPagesInit() {
        eventBus.off('pagesinit', onPagesInit);
        resolve();
      });
    });
  }

  /**
   * Replace every character listed in `CHARACTERS_TO_NORMALIZE` with its
   * plain-text equivalent.
   *
   * @param {string} text
   * @returns {string}
   */
  _normalize(text) {
    return text.replace(this._normalizationRegex, function(ch) {
      return CHARACTERS_TO_NORMALIZE[ch];
    });
  }

  /**
   * Helper for multi-term search that fills the `matches` and `matchesLength`
   * arrays and handles cases where one search term includes another search
   * term (for example, "tamed tame" or "this is"). It looks for intersecting
   * terms in `matchesWithLength` and keeps elements with a longer match
   * length.
   *
   * @param {Array<Object>} matchesWithLength - Objects of the form
   *   `{ match, matchLength, skipped, }`; the array is sorted in place and
   *   the `skipped` flag of dropped entries is set.
   * @param {Array<number>} matches - Receives the kept start indices.
   * @param {Array<number>} matchesLength - Receives the kept match lengths.
   */
  _prepareMatches(matchesWithLength, matches, matchesLength) {
    function isSubTerm(matchesWithLength, currentIndex) {
      const currentElem = matchesWithLength[currentIndex];
      const nextElem = matchesWithLength[currentIndex + 1];

      // Check for cases like "TAMEd TAME".
      if (currentIndex < matchesWithLength.length - 1 &&
          currentElem.match === nextElem.match) {
        currentElem.skipped = true;
        return true;
      }

      // Check for cases like "thIS IS".
      for (let i = currentIndex - 1; i >= 0; i--) {
        const prevElem = matchesWithLength[i];
        if (prevElem.skipped) {
          continue;
        }
        if (prevElem.match + prevElem.matchLength < currentElem.match) {
          break;
        }
        if (prevElem.match + prevElem.matchLength >=
            currentElem.match + currentElem.matchLength) {
          currentElem.skipped = true;
          return true;
        }
      }
      return false;
    }

    // Sort the array of `{ match: <match>, matchLength: <matchLength> }`
    // objects on increasing index first and on the length otherwise.
    matchesWithLength.sort(function(a, b) {
      return a.match === b.match ? a.matchLength - b.matchLength :
                                   a.match - b.match;
    });
    for (let i = 0, len = matchesWithLength.length; i < len; i++) {
      if (isSubTerm(matchesWithLength, i)) {
        continue;
      }
      matches.push(matchesWithLength[i].match);
      matchesLength.push(matchesWithLength[i].matchLength);
    }
  }

  /**
   * Determine if the search query constitutes a "whole word", by comparing the
   * first/last character type with the preceding/following character type.
   *
   * @param {string} content - The text that contains the match.
   * @param {number} startIdx - The index of the first matched character.
   * @param {number} length - The number of matched characters.
   * @returns {boolean}
   */
  _isEntireWord(content, startIdx, length) {
    if (startIdx > 0) {
      const first = content.charCodeAt(startIdx);
      const limit = content.charCodeAt(startIdx - 1);
      if (getCharacterType(first) === getCharacterType(limit)) {
        return false;
      }
    }
    const endIdx = (startIdx + length - 1);
    if (endIdx < (content.length - 1)) {
      const last = content.charCodeAt(endIdx);
      const limit = content.charCodeAt(endIdx + 1);
      if (getCharacterType(last) === getCharacterType(limit)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Find all non-overlapping occurrences of the complete query on a page and
   * store their start indices in `_pageMatches[pageIndex]`.
   *
   * @param {string} query
   * @param {number} pageIndex
   * @param {string} pageContent
   * @param {boolean} entireWord - Only keep matches accepted by
   *   `_isEntireWord`.
   */
  _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
    const matches = [];
    const queryLen = query.length;

    // Starting at `-queryLen` makes the first `indexOf` call begin at index 0.
    let matchIdx = -queryLen;
    while (true) {
      matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
      if (matchIdx === -1) {
        break;
      }
      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
        continue;
      }
      matches.push(matchIdx);
    }
    this._pageMatches[pageIndex] = matches;
  }

  /**
   * Find the occurrences of every whitespace-separated term of the query on a
   * page, and store the start indices in `_pageMatches[pageIndex]` and the
   * lengths in `_pageMatchesLength[pageIndex]`.
   *
   * @param {string} query
   * @param {number} pageIndex
   * @param {string} pageContent
   * @param {boolean} entireWord - Only keep matches accepted by
   *   `_isEntireWord`.
   */
  _calculateWordMatch(query, pageIndex, pageContent, entireWord) {
    const matchesWithLength = [];

    // Divide the query into pieces and search for text in each piece. Note
    // that `match` returns `null` for a query that only contains whitespace,
    // which is treated as a query without terms (and thus without matches).
    const queryArray = query.match(/\S+/g) || [];
    for (let i = 0, len = queryArray.length; i < len; i++) {
      const subquery = queryArray[i];
      const subqueryLen = subquery.length;

      let matchIdx = -subqueryLen;
      while (true) {
        matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
        if (matchIdx === -1) {
          break;
        }
        if (entireWord &&
            !this._isEntireWord(pageContent, matchIdx, subqueryLen)) {
          continue;
        }
        // Unlike the phrase search, the terms may differ in length, so the
        // length of every match is stored alongside its index.
        matchesWithLength.push({
          match: matchIdx,
          matchLength: subqueryLen,
          skipped: false,
        });
      }
    }

    // Prepare arrays for storing the matches.
    if (!this._pageMatchesLength) {
      this._pageMatchesLength = [];
    }
    this._pageMatchesLength[pageIndex] = [];
    this._pageMatches[pageIndex] = [];

    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    this._prepareMatches(matchesWithLength, this._pageMatches[pageIndex],
                         this._pageMatchesLength[pageIndex]);
  }

  /**
   * Compute the matches for one page using the current find state, update
   * the page, resume a search that was waiting for this page and update the
   * total match count.
   *
   * @param {number} pageIndex
   */
  _calculateMatch(pageIndex) {
    let pageContent = this._normalize(this._pageContents[pageIndex]);
    let query = this._normalize(this._state.query);
    const { caseSensitive, entireWord, phraseSearch, } = this._state;

    if (query.length === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }

    if (!caseSensitive) {
      pageContent = pageContent.toLowerCase();
      query = query.toLowerCase();
    }

    if (phraseSearch) {
      this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
    } else {
      this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
    }

    this._updatePage(pageIndex);
    if (this._resumePageIdx === pageIndex) {
      this._resumePageIdx = null;
      this._nextPageMatch();
    }

    // Update the match count.
    const pageMatchesCount = this._pageMatches[pageIndex].length;
    if (pageMatchesCount > 0) {
      this._matchesCountTotal += pageMatchesCount;
      this._updateUIResultsCount();
    }
  }

  /**
   * Extract the text of all pages, one page after the other. Every page has
   * its own promise in `_extractTextPromises` that is resolved with the page
   * index once the text of that page is stored in `_pageContents`.
   */
  _extractText() {
    // Perform text extraction once if this method is called multiple times.
    if (this._extractTextPromises.length > 0) {
      return;
    }

    let promise = Promise.resolve();
    for (let i = 0, ii = this._linkService.pagesCount; i < ii; i++) {
      const extractTextCapability = createPromiseCapability();
      this._extractTextPromises[i] = extractTextCapability.promise;

      promise = promise.then(() => {
        return this._pdfDocument.getPage(i + 1).then((pdfPage) => {
          return pdfPage.getTextContent({
            normalizeWhitespace: true,
          });
        }).then((textContent) => {
          const textItems = textContent.items;
          const strBuf = [];

          for (let j = 0, jj = textItems.length; j < jj; j++) {
            strBuf.push(textItems[j].str);
          }

          // Store the page content (text items) as one string.
          this._pageContents[i] = strBuf.join('');
          extractTextCapability.resolve(i);
        }, (reason) => {
          console.error(`Unable to get text content for page ${i + 1}`, reason);
          // Page error -- assuming no text content.
          this._pageContents[i] = '';
          extractTextCapability.resolve(i);
        });
      });
    }
  }

  /**
   * Announce that the matches of a page changed. If the page holds the
   * selected match, it is made the current page of the link service first.
   *
   * @param {number} index - The page index.
   */
  _updatePage(index) {
    if (this._selected.pageIdx === index) {
      // If the page is selected, scroll the page into view, which triggers
      // rendering the page, which adds the text layer. Once the text layer
      // is built, it will scroll to the selected match.
      this._linkService.page = index + 1;
    }

    this._eventBus.dispatch('updatetextlayermatches', {
      source: this,
      pageIndex: index,
    });
  }

  /**
   * Select the next (or previous) match, recalculating all matches first if
   * the find state changed since the last search.
   */
  _nextMatch() {
    const previous = this._state.findPrevious;
    const currentPageIndex = this._linkService.page - 1;
    const numPages = this._linkService.pagesCount;

    this._highlightMatches = true;

    if (this._dirtyMatch) {
      // Need to recalculate the matches, reset everything.
      this._dirtyMatch = false;
      this._selected.pageIdx = this._selected.matchIdx = -1;
      this._offset.pageIdx = currentPageIndex;
      this._offset.matchIdx = null;
      this._resumePageIdx = null;
      this._pageMatches.length = 0;
      this._pageMatchesLength = null;
      this._matchesCountTotal = 0;

      for (let i = 0; i < numPages; i++) {
        // Wipe out any previously highlighted matches.
        this._updatePage(i);

        // Start finding the matches as soon as the text is extracted.
        if (!(i in this._pendingFindMatches)) {
          this._pendingFindMatches[i] = true;
          this._extractTextPromises[i].then((pageIdx) => {
            delete this._pendingFindMatches[pageIdx];
            this._calculateMatch(pageIdx);
          });
        }
      }
    }

    // If there's no query there's no point in searching.
    if (this._state.query === '') {
      this._updateUIState(FindState.FOUND);
      return;
    }

    // If we're waiting on a page, we return since we can't do anything else.
    // Compare against `null` explicitly, since page index 0 is a valid
    // (but falsy) resume point.
    if (this._resumePageIdx !== null) {
      return;
    }

    const offset = this._offset;
    // Keep track of how many pages we should maximally iterate through.
    this._pagesToSearch = numPages;
    // If there's already a `matchIdx` that means we are iterating through a
    // page's matches.
    if (offset.matchIdx !== null) {
      const numPageMatches = this._pageMatches[offset.pageIdx].length;
      if ((!previous && offset.matchIdx + 1 < numPageMatches) ||
          (previous && offset.matchIdx > 0)) {
        // The simple case; we just have to advance the matchIdx to select
        // the next match on the page.
        offset.matchIdx = (previous ? offset.matchIdx - 1 :
                                      offset.matchIdx + 1);
        this._updateMatch(/* found = */ true);
        return;
      }
      // We went beyond the current page's matches, so we advance to
      // the next page.
      this._advanceOffsetPage(previous);
    }
    // Start searching through the page.
    this._nextPageMatch();
  }

  /**
   * Process the calculated matches of the page at the current offset.
   *
   * @param {Array<number>} matches - The matches of the current page.
   * @returns {boolean} `true` if searching should stop (a match was selected
   *   or all pages were searched without a result), `false` if the next page
   *   should be examined.
   */
  _matchesReady(matches) {
    const offset = this._offset;
    const numMatches = matches.length;
    const previous = this._state.findPrevious;

    if (numMatches) {
      // There were matches for the page, so initialize `matchIdx`.
      offset.matchIdx = (previous ? numMatches - 1 : 0);
      this._updateMatch(/* found = */ true);
      return true;
    }
    // No matches, so attempt to search the next page.
    this._advanceOffsetPage(previous);
    if (offset.wrapped) {
      offset.matchIdx = null;
      if (this._pagesToSearch < 0) {
        // No point in wrapping again, there were no matches.
        this._updateMatch(/* found = */ false);
        // While matches were not found, searching for a page
        // with matches should nevertheless halt.
        return true;
      }
    }
    // Matches were not found (and searching is not done).
    return false;
  }

  /**
   * Walk through the pages, starting at the current offset, until
   * `_matchesReady` reports that searching should stop or until a page is
   * reached whose matches have not been calculated yet.
   */
  _nextPageMatch() {
    if (this._resumePageIdx !== null) {
      console.error('There can only be one pending page.');
    }

    let matches = null;
    do {
      const pageIdx = this._offset.pageIdx;
      matches = this._pageMatches[pageIdx];
      if (!matches) {
        // The matches don't exist yet for processing by `_matchesReady`,
        // so set a resume point for when they do exist.
        this._resumePageIdx = pageIdx;
        break;
      }
    } while (!this._matchesReady(matches));
  }

  /**
   * Move the offset to the adjacent page, wrapping around at either end of
   * the document, and decrease the number of pages left to search.
   *
   * @param {boolean} previous - Move backwards instead of forwards.
   */
  _advanceOffsetPage(previous) {
    const offset = this._offset;
    const numPages = this._linkService.pagesCount;
    offset.pageIdx = (previous ? offset.pageIdx - 1 : offset.pageIdx + 1);
    offset.matchIdx = null;

    this._pagesToSearch--;

    if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
      offset.pageIdx = (previous ? numPages - 1 : 0);
      offset.wrapped = true;
    }
  }

  /**
   * Select the match at the current offset (if one was found), report the
   * resulting state to the UI and refresh the affected pages.
   *
   * @param {boolean} found - Whether a match was found.
   */
  _updateMatch(found = false) {
    let state = FindState.NOT_FOUND;
    const wrapped = this._offset.wrapped;
    this._offset.wrapped = false;

    if (found) {
      const previousPage = this._selected.pageIdx;
      this._selected.pageIdx = this._offset.pageIdx;
      this._selected.matchIdx = this._offset.matchIdx;
      state = (wrapped ? FindState.WRAPPED : FindState.FOUND);

      // Update the currently selected page to wipe out any selected matches.
      if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
        this._updatePage(previousPage);
      }
    }

    this._updateUIState(state, this._state.findPrevious);
    if (this._selected.pageIdx !== -1) {
      this._updatePage(this._selected.pageIdx);
    }
  }

  /**
   * Compute the one-based position of the selected match within the document
   * together with the total number of matches.
   *
   * @returns {Object} An object of the form `{ current, total, }`; both
   *   values are 0 if no consistent result is available.
   */
  _requestMatchesCount() {
    const { pageIdx, matchIdx, } = this._selected;
    let current = 0;
    let total = this._matchesCountTotal;
    if (matchIdx !== -1) {
      for (let i = 0; i < pageIdx; i++) {
        current += (this._pageMatches[i] && this._pageMatches[i].length) || 0;
      }
      current += matchIdx + 1;
    }
    // When searching starts, this method may be called before the `pageMatches`
    // have been counted (in `_calculateMatch`). Ensure that the UI won't show
    // temporarily broken state when the active find result doesn't make sense.
    if (current < 1 || current > total) {
      current = total = 0;
    }
    return { current, total, };
  }

  /** Dispatch the current match counts to the UI. */
  _updateUIResultsCount() {
    this._eventBus.dispatch('updatefindmatchescount', {
      source: this,
      matchesCount: this._requestMatchesCount(),
    });
  }

  /**
   * Dispatch the find state, together with the current match counts, to the
   * UI.
   *
   * @param {number} state - One of the `FindState` values.
   * @param {boolean} [previous] - Whether the search direction is backwards.
   */
  _updateUIState(state, previous) {
    this._eventBus.dispatch('updatefindcontrolstate', {
      source: this,
      state,
      previous,
      matchesCount: this._requestMatchesCount(),
    });
  }
}
|
|
|
|
export {
|
|
FindState,
|
|
PDFFindController,
|
|
};
|