// pdf.js/web/pdf_find_controller.js

/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { createPromiseCapability } from 'pdfjs-lib';
import { getCharacterType } from './pdf_find_utils';
import { getGlobalEventBus } from './dom_events';

/**
 * Search states that `PDFFindController` passes to its `onUpdateState`
 * callback.
 */
const FindState = {
  FOUND: 0, // A match was selected (also reported for an empty query).
  NOT_FOUND: 1, // The search finished without selecting a match.
  WRAPPED: 2, // A match was selected after wrapping around the document.
  PENDING: 3, // A command was received; the search has not completed yet.
};

// Delay applied before a 'find' command actually starts searching, so that
// no search is started while the user is still typing.
const FIND_TIMEOUT = 250; // ms

// Maps special characters to the plain replacement that is substituted into
// both the page text and the query before they are compared. Note that the
// fraction replacements are longer than the single character they replace.
const CHARACTERS_TO_NORMALIZE = {
  '\u2018': '\'', // Left single quotation mark
  '\u2019': '\'', // Right single quotation mark
  '\u201A': '\'', // Single low-9 quotation mark
  '\u201B': '\'', // Single high-reversed-9 quotation mark
  '\u201C': '"', // Left double quotation mark
  '\u201D': '"', // Right double quotation mark
  '\u201E': '"', // Double low-9 quotation mark
  '\u201F': '"', // Double high-reversed-9 quotation mark
  '\u00BC': '1/4', // Vulgar fraction one quarter
  '\u00BD': '1/2', // Vulgar fraction one half
  '\u00BE': '3/4', // Vulgar fraction three quarters
};

/**
 * Provides search functionality to find a given string in a PDF document.
 *
 * Flow, as implemented below: `setDocument` supplies the document,
 * `executeCommand` stores the search state and, once the 'pagesinit' event
 * has fired, starts text extraction and matching. Results are exposed through
 * the `pageMatches`, `pageMatchesLength` and `selected` getters; UI feedback
 * is delivered through the optional `onUpdateState`/`onUpdateResultsCount`
 * callbacks and the 'updatetextlayermatches' event.
 */
class PDFFindController {
  /**
   * @param {Object} options
   * @param {Object} options.pdfViewer - The viewer. This class reads its
   *   `pagesCount` and reads/writes its `currentPageNumber`.
   * @param {Object} [options.eventBus] - Event bus used to listen for
   *   'findbarclose'/'pagesinit' and to dispatch 'updatetextlayermatches'.
   *   Defaults to the global event bus.
   */
  constructor({ pdfViewer, eventBus = getGlobalEventBus(), }) {
    this._pdfViewer = pdfViewer;
    this._eventBus = eventBus;

    // Optional callbacks, assigned by the owner of this controller.
    this.onUpdateResultsCount = null;
    this.onUpdateState = null;

    this._reset();

    eventBus.on('findbarclose', () => {
      this._highlightMatches = false;

      // A `pageIndex` of -1 is dispatched here rather than a specific page.
      eventBus.dispatch('updatetextlayermatches', {
        source: this,
        pageIndex: -1,
      });
    });

    // Compile the regular expression for text normalization once.
    const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('');
    this._normalizationRegex = new RegExp(`[${replace}]`, 'g');
  }

  /** @returns {boolean} Whether matches should currently be highlighted. */
  get highlightMatches() {
    return this._highlightMatches;
  }

  /** @returns {Array} Per page, the start indices of the matches. */
  get pageMatches() {
    return this._pageMatches;
  }

  /**
   * @returns {Array|null} Per page, the lengths of the matches. Only filled
   *   by the multi-term (non-phrase) search; `null` otherwise.
   */
  get pageMatchesLength() {
    return this._pageMatchesLength;
  }

  /** @returns {Object} The selected match as `{ pageIdx, matchIdx }`. */
  get selected() {
    return this._selected;
  }

  /** @returns {Object|null} The state passed to the last `executeCommand`. */
  get state() {
    return this._state;
  }

  /**
   * Set a reference to the PDF document in order to search it.
   * Note that searching is not possible if this method is not called.
   * Any state belonging to a previously set document is discarded.
   *
   * @param {PDFDocumentProxy} pdfDocument - The PDF document to search.
   */
  setDocument(pdfDocument) {
    if (this._pdfDocument) {
      this._reset();
    }
    if (!pdfDocument) {
      return;
    }
    this._pdfDocument = pdfDocument;
  }

  /**
   * Start (or continue) a search. Does nothing when no document is set.
   *
   * @param {string} cmd - The command. 'find' is delayed by `FIND_TIMEOUT`;
   *   'findagain' re-uses the already computed matches when a previous state
   *   exists; every other value recalculates and searches immediately.
   * @param {Object} state - The search state. This class reads its `query`,
   *   `caseSensitive`, `entireWord`, `phraseSearch` and `findPrevious`
   *   properties.
   */
  executeCommand(cmd, state) {
    const pdfDocument = this._pdfDocument;
    if (!pdfDocument) {
      return;
    }

    if (this._state === null || cmd !== 'findagain') {
      this._dirtyMatch = true;
    }
    this._state = state;
    this._updateUIState(FindState.PENDING);

    this._firstPagePromise.then(() => {
      // If the document was closed or replaced before searching could begin,
      // this command is no longer relevant (and `_state` may have been reset).
      if (this._pdfDocument !== pdfDocument) {
        return;
      }
      this._extractText();

      clearTimeout(this._findTimeout);
      if (cmd === 'find') {
        // Trigger the find action with a small delay to avoid starting the
        // search when the user is still typing (saving resources).
        this._findTimeout =
          setTimeout(this._nextMatch.bind(this), FIND_TIMEOUT);
      } else {
        this._nextMatch();
      }
    });
  }

  /**
   * (Re)initialize all search state, cancel a pending delayed search, and
   * create a promise that resolves on the next 'pagesinit' event.
   */
  _reset() {
    this._highlightMatches = false;
    this._pdfDocument = null;
    this._pageMatches = [];
    this._pageMatchesLength = null;
    this._state = null;
    this._selected = { // Currently selected match.
      pageIdx: -1,
      matchIdx: -1,
    };
    this._offset = { // Where the find algorithm currently is in the document.
      pageIdx: null,
      matchIdx: null,
    };
    this._extractTextPromises = [];
    this._pageContents = []; // Stores the text for each page.
    this._matchesCountTotal = 0;
    this._pagesToSearch = null;
    this._pendingFindMatches = Object.create(null);
    this._resumePageIdx = null;
    this._dirtyMatch = false;

    // Cancel a delayed 'find'; otherwise `_nextMatch` would run against the
    // state that was just cleared above.
    clearTimeout(this._findTimeout);
    this._findTimeout = null;

    this._firstPagePromise = new Promise((resolve) => {
      const eventBus = this._eventBus;
      eventBus.on('pagesinit', function onPagesInit() {
        // One-shot listener: unregister as soon as the event has fired.
        eventBus.off('pagesinit', onPagesInit);
        resolve();
      });
    });
  }

  /**
   * Replace every character listed in `CHARACTERS_TO_NORMALIZE` with its
   * replacement.
   *
   * @param {string} text
   * @returns {string} The normalized text; it can be longer than `text`,
   *   since some replacements consist of several characters.
   */
  _normalize(text) {
    return text.replace(this._normalizationRegex, function(ch) {
      return CHARACTERS_TO_NORMALIZE[ch];
    });
  }

  /**
   * Helper for multi-term search that fills the `matches` and `matchesLength`
   * arrays and handles cases where one search term includes another search
   * term (for example, "tamed tame" or "this is"). It looks for intersecting
   * terms in `matchesWithLength` and keeps elements with a longer match
   * length.
   *
   * @param {Array<Object>} matchesWithLength - Objects of the form
   *   `{ match, matchLength, skipped }`; sorted in place, and `skipped` is
   *   set on the elements that are dropped.
   * @param {Array<number>} matches - Output: start indices of kept matches.
   * @param {Array<number>} matchesLength - Output: lengths of kept matches.
   */
  _prepareMatches(matchesWithLength, matches, matchesLength) {
    function isSubTerm(matchesWithLength, currentIndex) {
      const currentElem = matchesWithLength[currentIndex];
      const nextElem = matchesWithLength[currentIndex + 1];

      // Check for cases like "TAMEd TAME". Because of the sort order, the
      // next element starting at the same index is at least as long.
      if (currentIndex < matchesWithLength.length - 1 &&
          currentElem.match === nextElem.match) {
        currentElem.skipped = true;
        return true;
      }

      // Check for cases like "thIS IS": walk back over the kept elements
      // until one ends before the current element starts.
      for (let i = currentIndex - 1; i >= 0; i--) {
        const prevElem = matchesWithLength[i];
        if (prevElem.skipped) {
          continue;
        }
        if (prevElem.match + prevElem.matchLength < currentElem.match) {
          break;
        }
        if (prevElem.match + prevElem.matchLength >=
            currentElem.match + currentElem.matchLength) {
          currentElem.skipped = true;
          return true;
        }
      }
      return false;
    }

    // Sort the array of `{ match: <match>, matchLength: <matchLength> }`
    // objects on increasing index first and on the length otherwise.
    matchesWithLength.sort(function(a, b) {
      return a.match === b.match ? a.matchLength - b.matchLength :
                                   a.match - b.match;
    });
    for (let i = 0, len = matchesWithLength.length; i < len; i++) {
      if (isSubTerm(matchesWithLength, i)) {
        continue;
      }
      matches.push(matchesWithLength[i].match);
      matchesLength.push(matchesWithLength[i].matchLength);
    }
  }

  /**
   * Determine if the search query constitutes a "whole word", by comparing the
   * first/last character type with the preceding/following character type.
   *
   * @param {string} content - The text that was searched.
   * @param {number} startIdx - Index in `content` where the match starts.
   * @param {number} length - Length of the match.
   * @returns {boolean} `false` when the character before or after the match
   *   has the same character type as the adjacent first/last match character.
   */
  _isEntireWord(content, startIdx, length) {
    if (startIdx > 0) {
      const first = content.charCodeAt(startIdx);
      const limit = content.charCodeAt(startIdx - 1);
      if (getCharacterType(first) === getCharacterType(limit)) {
        return false;
      }
    }
    const endIdx = (startIdx + length - 1);
    if (endIdx < (content.length - 1)) {
      const last = content.charCodeAt(endIdx);
      const limit = content.charCodeAt(endIdx + 1);
      if (getCharacterType(last) === getCharacterType(limit)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Find all non-overlapping occurrences of the complete `query` and store
   * their start indices in `this._pageMatches[pageIndex]`.
   *
   * @param {string} query - Non-empty search string.
   * @param {number} pageIndex
   * @param {string} pageContent - Text of the page.
   * @param {boolean} entireWord - Only keep "whole word" occurrences.
   */
  _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) {
    const matches = [];
    const queryLen = query.length;

    // Starting at `-queryLen` makes the first search begin at index 0.
    let matchIdx = -queryLen;
    while (true) {
      matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
      if (matchIdx === -1) {
        break;
      }
      if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
        continue;
      }
      matches.push(matchIdx);
    }
    this._pageMatches[pageIndex] = matches;
  }

  /**
   * Search for every whitespace-separated term of `query` individually and
   * store the resulting start indices/lengths in
   * `this._pageMatches[pageIndex]`/`this._pageMatchesLength[pageIndex]`.
   *
   * @param {string} query - Search string; a query consisting only of
   *   whitespace yields no matches.
   * @param {number} pageIndex
   * @param {string} pageContent - Text of the page.
   * @param {boolean} entireWord - Only keep "whole word" occurrences.
   */
  _calculateWordMatch(query, pageIndex, pageContent, entireWord) {
    const matchesWithLength = [];

    // Divide the query into pieces and search for text in each piece.
    // `String.prototype.match` returns `null` (rather than an empty array)
    // when the query contains nothing but whitespace.
    const queryArray = query.match(/\S+/g) || [];
    for (let i = 0, len = queryArray.length; i < len; i++) {
      const subquery = queryArray[i];
      const subqueryLen = subquery.length;

      let matchIdx = -subqueryLen;
      while (true) {
        matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
        if (matchIdx === -1) {
          break;
        }
        if (entireWord &&
            !this._isEntireWord(pageContent, matchIdx, subqueryLen)) {
          continue;
        }
        // Terms can differ in length (in a phrase search every match has the
        // length of the query), so the length is stored with each match.
        matchesWithLength.push({
          match: matchIdx,
          matchLength: subqueryLen,
          skipped: false,
        });
      }
    }

    // Prepare arrays for storing the matches.
    if (!this._pageMatchesLength) {
      this._pageMatchesLength = [];
    }
    this._pageMatchesLength[pageIndex] = [];
    this._pageMatches[pageIndex] = [];

    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    this._prepareMatches(matchesWithLength, this._pageMatches[pageIndex],
      this._pageMatchesLength[pageIndex]);
  }

  /**
   * Compute the matches of the current query on one page, notify listeners
   * for that page, resume a search that was waiting for this page, and add
   * the page's matches to the total count.
   *
   * @param {number} pageIndex - Index of a page whose text was extracted.
   */
  _calculateMatch(pageIndex) {
    let pageContent = this._normalize(this._pageContents[pageIndex]);
    let query = this._normalize(this._state.query);
    const { caseSensitive, entireWord, phraseSearch, } = this._state;

    if (query.length === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }

    if (!caseSensitive) {
      pageContent = pageContent.toLowerCase();
      query = query.toLowerCase();
    }

    if (phraseSearch) {
      this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord);
    } else {
      this._calculateWordMatch(query, pageIndex, pageContent, entireWord);
    }

    this._updatePage(pageIndex);
    if (this._resumePageIdx === pageIndex) {
      // The search was waiting for exactly this page; continue it.
      this._resumePageIdx = null;
      this._nextPageMatch();
    }

    // Update the match count.
    const pageMatchesCount = this._pageMatches[pageIndex].length;
    if (pageMatchesCount > 0) {
      this._matchesCountTotal += pageMatchesCount;
      this._updateUIResultsCount();
    }
  }

  /**
   * Extract the text of all pages, one page after the other, storing it in
   * `this._pageContents`. For every page a promise is stored in
   * `this._extractTextPromises`; it resolves with the page index once the
   * text of that page is available (an empty string on failure).
   */
  _extractText() {
    // Perform text extraction once if this method is called multiple times.
    if (this._extractTextPromises.length > 0) {
      return;
    }

    // Each page is chained onto the previous one, i.e. extracted sequentially.
    let promise = Promise.resolve();
    for (let i = 0, ii = this._pdfViewer.pagesCount; i < ii; i++) {
      const extractTextCapability = createPromiseCapability();
      this._extractTextPromises[i] = extractTextCapability.promise;

      promise = promise.then(() => {
        return this._pdfDocument.getPage(i + 1).then((pdfPage) => {
          return pdfPage.getTextContent({
            normalizeWhitespace: true,
          });
        }).then((textContent) => {
          const textItems = textContent.items;
          const strBuf = [];

          for (let j = 0, jj = textItems.length; j < jj; j++) {
            strBuf.push(textItems[j].str);
          }

          // Store the page content (text items) as one string.
          this._pageContents[i] = strBuf.join('');
          extractTextCapability.resolve(i);
        }, (reason) => {
          console.error(`Unable to get text content for page ${i + 1}`, reason);
          // Page error -- assuming no text content.
          this._pageContents[i] = '';
          extractTextCapability.resolve(i);
        });
      });
    }
  }

  /**
   * Notify listeners that the matches of a page changed. If the page holds
   * the selected match, it is made the current page of the viewer first.
   *
   * @param {number} index - The page index.
   */
  _updatePage(index) {
    if (this._selected.pageIdx === index) {
      // If the page is selected, scroll the page into view, which triggers
      // rendering the page, which adds the textLayer. Once the textLayer is
      // build, it will scroll onto the selected match.
      this._pdfViewer.currentPageNumber = index + 1;
    }

    this._eventBus.dispatch('updatetextlayermatches', {
      source: this,
      pageIndex: index,
    });
  }

  /**
   * Move to the next (or previous) match. When the matches are "dirty", all
   * results are discarded and recalculated for every page first.
   */
  _nextMatch() {
    const previous = this._state.findPrevious;
    const currentPageIndex = this._pdfViewer.currentPageNumber - 1;
    const numPages = this._pdfViewer.pagesCount;

    this._highlightMatches = true;

    if (this._dirtyMatch) {
      // Need to recalculate the matches, reset everything.
      this._dirtyMatch = false;
      this._selected.pageIdx = this._selected.matchIdx = -1;
      this._offset.pageIdx = currentPageIndex;
      this._offset.matchIdx = null;
      this._resumePageIdx = null;
      this._pageMatches.length = 0;
      this._pageMatchesLength = null;
      this._matchesCountTotal = 0;

      for (let i = 0; i < numPages; i++) {
        // Wipe out any previously highlighted matches.
        this._updatePage(i);

        // Start finding the matches as soon as the text is extracted.
        // `_pendingFindMatches` prevents scheduling a page twice while its
        // text extraction is still outstanding.
        if (!(i in this._pendingFindMatches)) {
          this._pendingFindMatches[i] = true;
          this._extractTextPromises[i].then((pageIdx) => {
            delete this._pendingFindMatches[pageIdx];
            this._calculateMatch(pageIdx);
          });
        }
      }
    }

    // If there's no query there's no point in searching.
    if (this._state.query === '') {
      this._updateUIState(FindState.FOUND);
      return;
    }

    // If we're waiting on a page, we return since we can't do anything else.
    // Compare against `null` explicitly, since page index 0 is falsy.
    if (this._resumePageIdx !== null) {
      return;
    }

    const offset = this._offset;
    // Keep track of how many pages we should maximally iterate through.
    this._pagesToSearch = numPages;
    // If there's already a `matchIdx` that means we are iterating through a
    // page's matches.
    if (offset.matchIdx !== null) {
      const numPageMatches = this._pageMatches[offset.pageIdx].length;
      if ((!previous && offset.matchIdx + 1 < numPageMatches) ||
          (previous && offset.matchIdx > 0)) {
        // The simple case; we just have advance the matchIdx to select
        // the next match on the page.
        offset.matchIdx = (previous ? offset.matchIdx - 1 :
                                      offset.matchIdx + 1);
        this._updateMatch(/* found = */ true);
        return;
      }
      // We went beyond the current page's matches, so we advance to
      // the next page.
      this._advanceOffsetPage(previous);
    }
    // Start searching through the page.
    this._nextPageMatch();
  }

  /**
   * Process the (already computed) matches of the page at the current offset.
   *
   * @param {Array<number>} matches - The matches of that page.
   * @returns {boolean} `true` when iterating over pages should stop, either
   *   because a match was selected or because all pages were searched
   *   without result; `false` when the next page should be tried.
   */
  _matchesReady(matches) {
    const offset = this._offset;
    const numMatches = matches.length;
    const previous = this._state.findPrevious;

    if (numMatches) {
      // There were matches for the page, so initialize `matchIdx`.
      offset.matchIdx = (previous ? numMatches - 1 : 0);
      this._updateMatch(/* found = */ true);
      return true;
    }
    // No matches, so attempt to search the next page.
    this._advanceOffsetPage(previous);
    if (offset.wrapped) {
      offset.matchIdx = null;
      if (this._pagesToSearch < 0) {
        // No point in wrapping again, there were no matches.
        this._updateMatch(/* found = */ false);
        // While matches were not found, searching for a page
        // with matches should nevertheless halt.
        return true;
      }
    }
    // Matches were not found (and searching is not done).
    return false;
  }

  /**
   * Iterate over the pages, starting at the current offset, until
   * `_matchesReady` reports that searching should stop, or until a page is
   * reached whose matches have not been computed yet. In the latter case
   * that page is recorded in `_resumePageIdx`, so that `_calculateMatch` can
   * continue the search later.
   */
  _nextPageMatch() {
    if (this._resumePageIdx !== null) {
      console.error('There can only be one pending page.');
    }

    let matches = null;
    do {
      const pageIdx = this._offset.pageIdx;
      matches = this._pageMatches[pageIdx];
      if (!matches) {
        // The matches don't exist yet for processing by `_matchesReady`,
        // so set a resume point for when they do exist.
        this._resumePageIdx = pageIdx;
        break;
      }
    } while (!this._matchesReady(matches));
  }

  /**
   * Move the search offset one page forwards/backwards, wrapping around at
   * the document boundaries (which sets `offset.wrapped`), and decrement the
   * number of pages that remain to be searched.
   *
   * @param {boolean} previous - Move backwards rather than forwards.
   */
  _advanceOffsetPage(previous) {
    const offset = this._offset;
    const numPages = this._extractTextPromises.length;
    offset.pageIdx = (previous ? offset.pageIdx - 1 : offset.pageIdx + 1);
    offset.matchIdx = null;

    this._pagesToSearch--;

    if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
      offset.pageIdx = (previous ? numPages - 1 : 0);
      offset.wrapped = true;
    }
  }

  /**
   * Make the match at the current offset the selected one (when `found`),
   * report the resulting state to the UI, and refresh the affected pages.
   *
   * @param {boolean} [found] - Whether a match exists at the current offset.
   */
  _updateMatch(found = false) {
    let state = FindState.NOT_FOUND;
    const wrapped = this._offset.wrapped;
    this._offset.wrapped = false;

    if (found) {
      const previousPage = this._selected.pageIdx;
      this._selected.pageIdx = this._offset.pageIdx;
      this._selected.matchIdx = this._offset.matchIdx;
      state = (wrapped ? FindState.WRAPPED : FindState.FOUND);

      // Update the currently selected page to wipe out any selected matches.
      if (previousPage !== -1 && previousPage !== this._selected.pageIdx) {
        this._updatePage(previousPage);
      }
    }

    this._updateUIState(state, this._state.findPrevious);
    if (this._selected.pageIdx !== -1) {
      this._updatePage(this._selected.pageIdx);
    }
  }

  /**
   * Compute the 1-based position of the selected match among all matches.
   *
   * @returns {Object} `{ current, total }`; both are 0 when no match is
   *   selected or when the counts are not (yet) consistent.
   */
  _requestMatchesCount() {
    const { pageIdx, matchIdx, } = this._selected;
    let current = 0, total = this._matchesCountTotal;
    if (matchIdx !== -1) {
      // Pages whose matches are not computed yet contribute nothing.
      for (let i = 0; i < pageIdx; i++) {
        current += (this._pageMatches[i] && this._pageMatches[i].length) || 0;
      }
      current += matchIdx + 1;
    }
    // When searching starts, this method may be called before the `pageMatches`
    // have been counted (in `_calculateMatch`). Ensure that the UI won't show
    // temporarily broken state when the active find result doesn't make sense.
    if (current < 1 || current > total) {
      current = total = 0;
    }
    return { current, total, };
  }

  /**
   * Pass the current match counts to `onUpdateResultsCount`, if it is set.
   */
  _updateUIResultsCount() {
    if (!this.onUpdateResultsCount) {
      return;
    }
    const matchesCount = this._requestMatchesCount();
    this.onUpdateResultsCount(matchesCount);
  }

  /**
   * Pass the search state and the current match counts to `onUpdateState`,
   * if it is set.
   *
   * @param {number} state - One of the `FindState` values.
   * @param {boolean} [previous] - Whether the search direction is backwards.
   */
  _updateUIState(state, previous) {
    if (!this.onUpdateState) {
      return;
    }
    const matchesCount = this._requestMatchesCount();
    this.onUpdateState(state, previous, matchesCount);
  }
}

// Names made available to other modules.
export {
  FindState,
  PDFFindController,
};