/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { createPromiseCapability } from 'pdfjs-lib';
import { getCharacterType } from './pdf_find_utils';
import { getGlobalEventBus } from './dom_events';
import { scrollIntoView } from './ui_utils';

// States reported to `onUpdateState`.
const FindState = {
  FOUND: 0,
  NOT_FOUND: 1,
  WRAPPED: 2,
  PENDING: 3,
};

// Offsets passed to `scrollIntoView` when scrolling to the selected match.
const FIND_SCROLL_OFFSET_TOP = -50;
const FIND_SCROLL_OFFSET_LEFT = -400;
// Delay before a 'find' command actually starts searching.
const FIND_TIMEOUT = 250; // ms

// Characters that are replaced, in both the page text and the query, before
// matching takes place.
const CHARACTERS_TO_NORMALIZE = {
  '\u2018': '\'', // Left single quotation mark
  '\u2019': '\'', // Right single quotation mark
  '\u201A': '\'', // Single low-9 quotation mark
  '\u201B': '\'', // Single high-reversed-9 quotation mark
  '\u201C': '"', // Left double quotation mark
  '\u201D': '"', // Right double quotation mark
  '\u201E': '"', // Double low-9 quotation mark
  '\u201F': '"', // Double high-reversed-9 quotation mark
  '\u00BC': '1/4', // Vulgar fraction one quarter
  '\u00BD': '1/2', // Vulgar fraction one half
  '\u00BE': '3/4', // Vulgar fraction three quarters
};

/**
 * Provides search functionality to find a given string in a PDF document.
 */
class PDFFindController {
  /**
   * @param {Object} options
   * @param {Object} options.pdfViewer - The viewer to search in. This class
   *   uses its `pagesCount`, `currentPageNumber`, `getPageTextContent` and
   *   `getPageView` members.
   * @param {Object} [options.eventBus] - Event bus on which 'pagesinit' is
   *   listened for; defaults to the global event bus.
   */
  constructor({ pdfViewer, eventBus = getGlobalEventBus(), }) {
    this._pdfViewer = pdfViewer;
    this._eventBus = eventBus;

    // Optional callbacks, to be assigned by the owner of this instance.
    this.onUpdateResultsCount = null;
    this.onUpdateState = null;

    this.reset();

    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('');
    this._normalizationRegex = new RegExp('[' + replace + ']', 'g');
  }

  get pageMatches() {
    return this._pageMatches;
  }

  get pageMatchesLength() {
    return this._pageMatchesLength;
  }

  get selected() {
    return this._selected;
  }

  get state() {
    return this._state;
  }

  /**
   * Discard all find state and wait for the next 'pagesinit' event before
   * any further command is processed.
   */
  reset() {
    this.active = false; // If active, find results will be highlighted.
    this._pageMatches = [];
    this._pageMatchesLength = null;
    this._state = null;
    this._selected = { // Currently selected match.
      pageIdx: -1,
      matchIdx: -1,
    };
    this._offset = { // Where the find algorithm currently is in the document.
      pageIdx: null,
      matchIdx: null,
    };
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the text for each page.
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = Object.create(null);
    this._resumePageIdx = null;
    this._dirtyMatch = false;
    // Cancel a delayed search that is still scheduled; it would otherwise
    // run `_nextMatch` against the `null` state assigned above and throw.
    clearTimeout(this._findTimeout);
    this._findTimeout = null;

    this._firstPagePromise = new Promise((resolve) => {
      const eventBus = this._eventBus;
      eventBus.on('pagesinit', function onPagesInit() {
        eventBus.off('pagesinit', onPagesInit);
        resolve();
      });
    });
  }

  /**
   * Run a find command once the first 'pagesinit' event has been seen.
   *
   * @param {string} cmd - The command; 'find' is debounced by `FIND_TIMEOUT`,
   *   anything else runs immediately. Every command except 'findagain'
   *   forces the matches to be recalculated.
   * @param {Object} state - The find parameters. This class reads `query`,
   *   `caseSensitive`, `phraseSearch`, `entireWord` and `findPrevious`.
   */
  executeCommand(cmd, state) {
    if (this._state === null || cmd !== 'findagain') {
      this._dirtyMatch = true;
    }
    this._state = state;
    this._updateUIState(FindState.PENDING);

    this._firstPagePromise.then(() => {
      this._extractText();

      clearTimeout(this._findTimeout);
      if (cmd === 'find') {
        // Trigger the find action with a small delay to avoid starting the
        // search when the user is still typing (saving resources).
        this._findTimeout =
          setTimeout(this._nextMatch.bind(this), FIND_TIMEOUT);
      } else {
        this._nextMatch();
      }
    });
  }

  /**
   * Called from the text layer when match presentation is updated.
   * Scrolls the match into view if it is the currently selected one.
   *
   * @param {number} pageIndex - The index of the page.
   * @param {number} matchIndex - The index of the match.
   * @param {Array} elements - Text layer `div` elements.
   * @param {number} beginIdx - Start index of the `div` array for the match.
   */
  updateMatchPosition(pageIndex, matchIndex, elements, beginIdx) {
    if (this.selected.matchIdx === matchIndex &&
        this.selected.pageIdx === pageIndex) {
      const spot = {
        top: FIND_SCROLL_OFFSET_TOP,
        left: FIND_SCROLL_OFFSET_LEFT,
      };

      scrollIntoView(elements[beginIdx], spot,
                     /* skipOverflowHiddenElements = */ true);
    }
  }

  /**
   * Replace every character listed in `CHARACTERS_TO_NORMALIZE` with its
   * mapped replacement.
   *
   * @param {string} text
   * @returns {string} The normalized text.
   */
  _normalize(text) {
    return text.replace(this._normalizationRegex, function(ch) {
      return CHARACTERS_TO_NORMALIZE[ch];
    });
  }

  /**
   * Helper for multi-term search that fills the `matches` and
   * `matchesLength` arrays from `matchesWithLength`, and handles cases where
   * one search term includes another search term (for example, "tamed tame"
   * or "this is"). It looks for intersecting terms and keeps elements with a
   * longer match length.
   *
   * @param {Array<Object>} matchesWithLength - `{ match, matchLength,
   *   skipped }` entries; sorted in place and flagged via `skipped`.
   * @param {Array<number>} matches - Receives the start index of each match
   *   that is kept.
   * @param {Array<number>} matchesLength - Receives the length of each match
   *   that is kept.
   */
  _prepareMatches(matchesWithLength, matches, matchesLength) {
    function isSubTerm(matchesWithLength, currentIndex) {
      const currentElem = matchesWithLength[currentIndex];
      const nextElem = matchesWithLength[currentIndex + 1];

      // Check for cases like "TAMEd TAME".
      if (currentIndex < matchesWithLength.length - 1 &&
          currentElem.match === nextElem.match) {
        currentElem.skipped = true;
        return true;
      }

      // Check for cases like "thIS IS".
      for (let i = currentIndex - 1; i >= 0; i--) {
        const prevElem = matchesWithLength[i];
        if (prevElem.skipped) {
          continue;
        }
        if (prevElem.match + prevElem.matchLength < currentElem.match) {
          // The previous match ends before this one starts; given the sort
          // order, the scan stops here.
          break;
        }
        if (prevElem.match + prevElem.matchLength >=
            currentElem.match + currentElem.matchLength) {
          currentElem.skipped = true;
          return true;
        }
      }
      return false;
    }

    // Sort the array of `{ match: <match>, matchLength: <matchLength> }`
    // objects on increasing index first and on the length otherwise.
    matchesWithLength.sort(function(a, b) {
      return a.match === b.match ? a.matchLength - b.matchLength :
                                   a.match - b.match;
    });
    for (let i = 0, len = matchesWithLength.length; i < len; i++) {
      if (isSubTerm(matchesWithLength, i)) {
        continue;
      }
      matches.push(matchesWithLength[i].match);
      matchesLength.push(matchesWithLength[i].matchLength);
    }
  }

  /**
   * Determine if the search query constitutes a "whole word", by comparing the
   * first/last character type with the preceding/following character type.
   *
   * @param {string} content - The text being searched.
   * @param {number} startIdx - Index in `content` where the match starts.
   * @param {number} length - Length of the match.
   * @returns {boolean} `false` if the character on either side of the match
   *   has the same type (per `getCharacterType`) as the adjacent character
   *   inside the match; `true` otherwise.
   */
  _isEntireWord(content, startIdx, length) {
    if (startIdx > 0) {
      const first = content.charCodeAt(startIdx);
      const limit = content.charCodeAt(startIdx - 1);
      if (getCharacterType(first) === getCharacterType(limit)) {
        return false;
      }
    }
    const endIdx = (startIdx + length - 1);
    if (endIdx < (content.length - 1)) {
      const last = content.charCodeAt(endIdx);
      const limit = content.charCodeAt(endIdx + 1);
      if (getCharacterType(last) === getCharacterType(limit)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Find every occurrence of the entire `query` in `pageContent` and store
   * the start indices in `this._pageMatches[pageIndex]`.
   *
   * @param {string} query - Non-empty search string.
   * @param {number} pageIndex
   * @param {string} pageContent
   * @param {boolean} entireWord - Keep only matches accepted by
   *   `_isEntireWord`.
   */
  _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
    const matches = [];
    const queryLen = query.length;

    // Start at `-queryLen` so that the first `indexOf` call begins at 0;
    // every later call resumes just past the previous hit.
    let matchIdx = -queryLen;
    while (true) {
      matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
      if (matchIdx === -1) {
        break;
      }
      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
        continue;
      }
      matches.push(matchIdx);
    }
    this._pageMatches[pageIndex] = matches;
  }

  /**
   * Find every occurrence of each whitespace-separated term of `query` in
   * `pageContent`, and store the start indices and lengths in
   * `this._pageMatches[pageIndex]` and `this._pageMatchesLength[pageIndex]`.
   *
   * @param {string} query
   * @param {number} pageIndex
   * @param {string} pageContent
   * @param {boolean} entireWord - Keep only matches accepted by
   *   `_isEntireWord`.
   */
  _calculateWordMatch(query, pageIndex, pageContent, entireWord) {
    const matchesWithLength = [];

    // Divide the query into pieces and search for text in each piece.
    // `match` returns `null` for a whitespace-only query; treat that as
    // "no terms" rather than throwing when reading `length`.
    const queryArray = query.match(/\S+/g) || [];
    for (let i = 0, len = queryArray.length; i < len; i++) {
      const subquery = queryArray[i];
      const subqueryLen = subquery.length;

      let matchIdx = -subqueryLen;
      while (true) {
        matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
        if (matchIdx === -1) {
          break;
        }
        if (entireWord &&
            !this._isEntireWord(pageContent, matchIdx, subqueryLen)) {
          continue;
        }
        // Terms may differ in length, so the length is stored together with
        // each match.
        matchesWithLength.push({
          match: matchIdx,
          matchLength: subqueryLen,
          skipped: false,
        });
      }
    }

    // Prepare arrays for storing the matches.
    if (!this._pageMatchesLength) {
      this._pageMatchesLength = [];
    }
    this._pageMatchesLength[pageIndex] = [];
    this._pageMatches[pageIndex] = [];

    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    this._prepareMatches(matchesWithLength, this._pageMatches[pageIndex],
                         this._pageMatchesLength[pageIndex]);
  }

  /**
   * Compute the matches for one page from its extracted text and the current
   * find state, refresh that page, resume a search that was waiting on the
   * page, and update the total match count.
   *
   * @param {number} pageIndex
   */
  _calculateMatch(pageIndex) {
    let pageContent = this._normalize(this._pageContents[pageIndex]);
    let query = this._normalize(this._state.query);
    const caseSensitive = this._state.caseSensitive;
    const phraseSearch = this._state.phraseSearch;
    const entireWord = this._state.entireWord;
    const queryLen = query.length;

    if (queryLen === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }

    if (!caseSensitive) {
      pageContent = pageContent.toLowerCase();
      query = query.toLowerCase();
    }

    if (phraseSearch) {
      this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
    } else {
      this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
    }

    this._updatePage(pageIndex);
    if (this._resumePageIdx === pageIndex) {
      this._resumePageIdx = null;
      this._nextPageMatch();
    }

    // Update the match count.
    const pageMatchesCount = this._pageMatches[pageIndex].length;
    if (pageMatchesCount > 0) {
      this._matchesCountTotal += pageMatchesCount;
      this._updateUIResultsCount();
    }
  }

  /**
   * Start extracting the text of every page, one page after another, filling
   * `this._pageContents` and one promise per page in
   * `this._extractTextPromises` (each resolves with its page index).
   */
  _extractText() {
    // Perform text extraction once if this method is called multiple times.
    if (this._extractTextPromises.length > 0) {
      return;
    }

    let promise = Promise.resolve();
    for (let i = 0, ii = this._pdfViewer.pagesCount; i < ii; i++) {
      const extractTextCapability = createPromiseCapability();
      this._extractTextPromises[i] = extractTextCapability.promise;

      promise = promise.then(() => {
        return this._pdfViewer.getPageTextContent(i).then((textContent) => {
          const textItems = textContent.items;
          const strBuf = [];

          for (let j = 0, jj = textItems.length; j < jj; j++) {
            strBuf.push(textItems[j].str);
          }

          // Store the pageContent as a string.
          this._pageContents[i] = strBuf.join('');
          extractTextCapability.resolve(i);
        }, (reason) => {
          console.error(`Unable to get page ${i + 1} text content`, reason);
          // Page error -- assuming no text content.
          this._pageContents[i] = '';
          extractTextCapability.resolve(i);
        });
      });
    }
  }

  /**
   * Ask the text layer of a page (if it has one) to update its matches,
   * first making the page current when it holds the selected match.
   *
   * @param {number} index - The page index.
   */
  _updatePage(index) {
    if (this._selected.pageIdx === index) {
      // If the page is selected, scroll the page into view, which triggers
      // rendering the page, which adds the textLayer. Once the textLayer is
      // built, it will scroll onto the selected match.
      this._pdfViewer.currentPageNumber = index + 1;
    }

    const page = this._pdfViewer.getPageView(index);
    if (page.textLayer) {
      page.textLayer.updateMatches();
    }
  }

  /**
   * Advance to the next (or previous) match, recalculating all matches
   * first when the find state has changed.
   */
  _nextMatch() {
    const previous = this._state.findPrevious;
    const currentPageIndex = this._pdfViewer.currentPageNumber - 1;
    const numPages = this._pdfViewer.pagesCount;

    this.active = true;

    if (this._dirtyMatch) {
      // Need to recalculate the matches, reset everything.
      this._dirtyMatch = false;
      this._selected.pageIdx = this._selected.matchIdx = -1;
      this._offset.pageIdx = currentPageIndex;
      this._offset.matchIdx = null;
      this._resumePageIdx = null;
      this._pageMatches = [];
      this._matchesCountTotal = 0;
      this._pageMatchesLength = null;

      for (let i = 0; i < numPages; i++) {
        // Wipe out any previously highlighted matches.
        this._updatePage(i);

        // Start finding the matches as soon as the text is extracted.
        if (!(i in this._pendingFindMatches)) {
          this._pendingFindMatches[i] = true;
          this._extractTextPromises[i].then((pageIdx) => {
            delete this._pendingFindMatches[pageIdx];
            this._calculateMatch(pageIdx);
          });
        }
      }
    }

    // If there's no query there's no point in searching.
    if (this._state.query === '') {
      this._updateUIState(FindState.FOUND);
      return;
    }

    // If we're waiting on a page, we return since we can't do anything else.
    // Compare against `null` explicitly: page index 0 is a valid resume point
    // but is falsy.
    if (this._resumePageIdx !== null) {
      return;
    }

    const offset = this._offset;
    // Keep track of how many pages we should maximally iterate through.
    this._pagesToSearch = numPages;
    // If there's already a `matchIdx` that means we are iterating through a
    // page's matches.
    if (offset.matchIdx !== null) {
      const numPageMatches = this._pageMatches[offset.pageIdx].length;
      if ((!previous && offset.matchIdx + 1 < numPageMatches) ||
          (previous && offset.matchIdx > 0)) {
        // The simple case; we just have to advance the matchIdx to select
        // the next match on the page.
        offset.matchIdx = (previous ? offset.matchIdx - 1 :
                                      offset.matchIdx + 1);
        this._updateMatch(/* found = */ true);
        return;
      }
      // We went beyond the current page's matches, so we advance to
      // the next page.
      this._advanceOffsetPage(previous);
    }
    // Start searching through the page.
    this._nextPageMatch();
  }

  /**
   * Process the matches of the page at the current offset.
   *
   * @param {Array<number>} matches - The matches of that page.
   * @returns {boolean} `true` when the page-by-page search should stop (a
   *   match was selected, or every page was visited without a match);
   *   `false` when the next page should be examined.
   */
  _matchesReady(matches) {
    const offset = this._offset;
    const numMatches = matches.length;
    const previous = this._state.findPrevious;

    if (numMatches) {
      // There were matches for the page, so initialize `matchIdx`.
      offset.matchIdx = (previous ? numMatches - 1 : 0);
      this._updateMatch(/* found = */ true);
      return true;
    }
    // No matches, so attempt to search the next page.
    this._advanceOffsetPage(previous);
    if (offset.wrapped) {
      offset.matchIdx = null;
      if (this._pagesToSearch < 0) {
        // No point in wrapping again, there were no matches.
        this._updateMatch(/* found = */ false);
        // While matches were not found, searching for a page
        // with matches should nevertheless halt.
        return true;
      }
    }
    // Matches were not found (and searching is not done).
    return false;
  }

  /**
   * Walk through the pages, starting at the current offset, until
   * `_matchesReady` reports that the search should stop or a page whose
   * matches have not been calculated yet is reached.
   */
  _nextPageMatch() {
    if (this._resumePageIdx !== null) {
      console.error('There can only be one pending page.');
    }

    let matches = null;
    do {
      const pageIdx = this._offset.pageIdx;
      matches = this._pageMatches[pageIdx];
      if (!matches) {
        // The matches don't exist yet for processing by `_matchesReady`,
        // so set a resume point for when they do exist.
        this._resumePageIdx = pageIdx;
        break;
      }
    } while (!this._matchesReady(matches));
  }

  /**
   * Move the offset to the adjacent page, wrapping around at either end of
   * the document, and decrement the number of pages left to search.
   *
   * @param {boolean} previous - Move backwards instead of forwards.
   */
  _advanceOffsetPage(previous) {
    const offset = this._offset;
    const numPages = this._extractTextPromises.length;
    offset.pageIdx = (previous ? offset.pageIdx - 1 : offset.pageIdx + 1);
    offset.matchIdx = null;

    this._pagesToSearch--;

    if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
      offset.pageIdx = (previous ? numPages - 1 : 0);
      offset.wrapped = true;
    }
  }

  /**
   * Publish the outcome of a search step: copy the offset into the selected
   * match when one was found, report the state and refresh the affected
   * pages.
   *
   * @param {boolean} [found=false] - Whether a match was found.
   */
  _updateMatch(found = false) {
    let state = FindState.NOT_FOUND;
    const wrapped = this._offset.wrapped;
    this._offset.wrapped = false;

    if (found) {
      const previousPage = this._selected.pageIdx;
      this.selected.pageIdx = this._offset.pageIdx;
      this.selected.matchIdx = this._offset.matchIdx;
      state = (wrapped ? FindState.WRAPPED : FindState.FOUND);

      // Update the currently selected page to wipe out any selected matches.
      if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
        this._updatePage(previousPage);
      }
    }

    this._updateUIState(state, this._state.findPrevious);
    if (this._selected.pageIdx !== -1) {
      this._updatePage(this._selected.pageIdx);
    }
  }

  /**
   * @returns {Object} `{ current, total }`, where `current` is the 1-based
   *   position of the selected match among all counted matches (0 if no
   *   match is selected) and `total` is the number of matches counted so
   *   far.
   */
  _requestMatchesCount() {
    const { pageIdx, matchIdx, } = this._selected;
    let current = 0, total = this._matchesCountTotal;
    if (matchIdx !== -1) {
      for (let i = 0; i < pageIdx; i++) {
        current += (this._pageMatches[i] && this._pageMatches[i].length) || 0;
      }
      current += matchIdx + 1;
    }
    // When searching starts, this method may be called before the `pageMatches`
    // have been counted (in `_calculateMatch`). Ensure that the UI won't show
    // temporarily broken state when the active find result doesn't make sense.
    if (current > total) {
      current = total = 0;
    }
    return { current, total, };
  }

  /**
   * Pass the current match counts to `onUpdateResultsCount`, if assigned.
   */
  _updateUIResultsCount() {
    if (!this.onUpdateResultsCount) {
      return;
    }
    const matchesCount = this._requestMatchesCount();
    this.onUpdateResultsCount(matchesCount);
  }

  /**
   * Pass a state change, together with the current match counts, to
   * `onUpdateState`, if assigned.
   *
   * @param {number} state - One of the `FindState` values.
   * @param {boolean} [previous] - The `findPrevious` flag of the find state.
   */
  _updateUIState(state, previous) {
    if (!this.onUpdateState) {
      return;
    }
    const matchesCount = this._requestMatchesCount();
    this.onUpdateState(state, previous, matchesCount);
  }
}

export {
  FindState,
  PDFFindController,
};