pdf.js/web/pdf_find_controller.js

/* Copyright 2012 Mozilla Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import { createPromiseCapability } from 'pdfjs-lib';
import { getGlobalEventBus } from './dom_events';
import { scrollIntoView } from './ui_utils';

const FindState = {
  FOUND: 0,
  NOT_FOUND: 1,
  WRAPPED: 2,
  PENDING: 3,
};

const FIND_SCROLL_OFFSET_TOP = -50;
const FIND_SCROLL_OFFSET_LEFT = -400;
const FIND_TIMEOUT = 250; // ms

const CHARACTERS_TO_NORMALIZE = {
  '\u2018': '\'', // Left single quotation mark
  '\u2019': '\'', // Right single quotation mark
  '\u201A': '\'', // Single low-9 quotation mark
  '\u201B': '\'', // Single high-reversed-9 quotation mark
  '\u201C': '"', // Left double quotation mark
  '\u201D': '"', // Right double quotation mark
  '\u201E': '"', // Double low-9 quotation mark
  '\u201F': '"', // Double high-reversed-9 quotation mark
  '\u00BC': '1/4', // Vulgar fraction one quarter
  '\u00BD': '1/2', // Vulgar fraction one half
  '\u00BE': '3/4', // Vulgar fraction three quarters
};

/**
 * Provides search functionality to find a given string in a PDF document.
 */
class PDFFindController {
  constructor({ pdfViewer, eventBus = getGlobalEventBus(), }) {
    this.pdfViewer = pdfViewer;
    this.eventBus = eventBus;

    this.onUpdateResultsCount = null;
    this.onUpdateState = null;

    this.reset();

    // Compile the regular expression for text normalization once.
    let replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('');
    this.normalizationRegex = new RegExp('[' + replace + ']', 'g');
  }

  reset() {
    this.startedTextExtraction = false;
    this.extractTextPromises = [];
    this.pendingFindMatches = Object.create(null);
    this.active = false; // If active, find results will be highlighted.
    this.pageContents = []; // Stores the text for each page.
    this.pageMatches = [];
    this.pageMatchesLength = null;
    this.matchCount = 0;
    this.selected = { // Currently selected match.
      pageIdx: -1,
      matchIdx: -1,
    };
    this.offset = { // Where the find algorithm currently is in the document.
      pageIdx: null,
      matchIdx: null,
    };
    this.pagesToSearch = null;
    this.resumePageIdx = null;
    this.state = null;
    this.dirtyMatch = false;
    this.findTimeout = null;

    this._firstPagePromise = new Promise((resolve) => {
      const eventBus = this.eventBus;
      eventBus.on('pagesinit', function onPagesInit() {
        eventBus.off('pagesinit', onPagesInit);
        resolve();
      });
    });
  }

  executeCommand(cmd, state) {
    if (this.state === null || cmd !== 'findagain') {
      this.dirtyMatch = true;
    }
    this.state = state;
    this._updateUIState(FindState.PENDING);

    this._firstPagePromise.then(() => {
      this._extractText();

      clearTimeout(this.findTimeout);
      if (cmd === 'find') {
        // Trigger the find action with a small delay to avoid starting the
        // search when the user is still typing (saving resources).
        this.findTimeout = setTimeout(this._nextMatch.bind(this), FIND_TIMEOUT);
      } else {
        this._nextMatch();
      }
    });
  }

  /**
   * Called from the text layer when match presentation is updated.
   *
   * @param {number} pageIndex - The index of the page.
   * @param {number} matchIndex - The index of the match.
   * @param {Array} elements - Text layer `div` elements.
   * @param {number} beginIdx - Start index of the `div` array for the match.
   */
  updateMatchPosition(pageIndex, matchIndex, elements, beginIdx) {
    if (this.selected.matchIdx === matchIndex &&
        this.selected.pageIdx === pageIndex) {
      let spot = {
        top: FIND_SCROLL_OFFSET_TOP,
        left: FIND_SCROLL_OFFSET_LEFT,
      };
      scrollIntoView(elements[beginIdx], spot,
                     /* skipOverflowHiddenElements = */ true);
    }
  }

  _normalize(text) {
    return text.replace(this.normalizationRegex, function (ch) {
      return CHARACTERS_TO_NORMALIZE[ch];
    });
  }

  /**
   * Helper for multi-term search that fills the `matchesWithLength` array
   * and handles cases where one search term includes another search term (for
   * example, "tamed tame" or "this is"). It looks for intersecting terms in
   * the `matches` and keeps elements with a longer match length.
   */
  _prepareMatches(matchesWithLength, matches, matchesLength) {
    function isSubTerm(matchesWithLength, currentIndex) {
      let currentElem = matchesWithLength[currentIndex];
      let nextElem = matchesWithLength[currentIndex + 1];

      // Check for cases like "TAMEd TAME".
      if (currentIndex < matchesWithLength.length - 1 &&
          currentElem.match === nextElem.match) {
        currentElem.skipped = true;
        return true;
      }

      // Check for cases like "thIS IS".
      for (let i = currentIndex - 1; i >= 0; i--) {
        let prevElem = matchesWithLength[i];
        if (prevElem.skipped) {
          continue;
        }
        if (prevElem.match + prevElem.matchLength < currentElem.match) {
          break;
        }
        if (prevElem.match + prevElem.matchLength >=
            currentElem.match + currentElem.matchLength) {
          currentElem.skipped = true;
          return true;
        }
      }
      return false;
    }

    // Sort the array of `{ match: <match>, matchLength: <matchLength> }`
    // objects on increasing index first and on the length otherwise.
    matchesWithLength.sort(function(a, b) {
      return a.match === b.match ? a.matchLength - b.matchLength :
                                   a.match - b.match;
    });
    for (let i = 0, len = matchesWithLength.length; i < len; i++) {
      if (isSubTerm(matchesWithLength, i)) {
        continue;
      }
      matches.push(matchesWithLength[i].match);
      matchesLength.push(matchesWithLength[i].matchLength);
    }
  }

  _calculatePhraseMatch(query, pageIndex, pageContent) {
    let matches = [];
    let queryLen = query.length;
    let matchIdx = -queryLen;
    while (true) {
      matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
      if (matchIdx === -1) {
        break;
      }
      matches.push(matchIdx);
    }
    this.pageMatches[pageIndex] = matches;
  }

  _calculateWordMatch(query, pageIndex, pageContent) {
    let matchesWithLength = [];
    // Divide the query into pieces and search for text in each piece.
    let queryArray = query.match(/\S+/g);
    for (let i = 0, len = queryArray.length; i < len; i++) {
      let subquery = queryArray[i];
      let subqueryLen = subquery.length;
      let matchIdx = -subqueryLen;
      while (true) {
        matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen);
        if (matchIdx === -1) {
          break;
        }
        // Other searches do not, so we store the length.
        matchesWithLength.push({
          match: matchIdx,
          matchLength: subqueryLen,
          skipped: false,
        });
      }
    }

    // Prepare arrays for storing the matches.
    if (!this.pageMatchesLength) {
      this.pageMatchesLength = [];
    }
    this.pageMatchesLength[pageIndex] = [];
    this.pageMatches[pageIndex] = [];

    // Sort `matchesWithLength`, remove intersecting terms and put the result
    // into the two arrays.
    this._prepareMatches(matchesWithLength, this.pageMatches[pageIndex],
      this.pageMatchesLength[pageIndex]);
  }

  _calculateMatch(pageIndex) {
    let pageContent = this._normalize(this.pageContents[pageIndex]);
    let query = this._normalize(this.state.query);
    let caseSensitive = this.state.caseSensitive;
    let phraseSearch = this.state.phraseSearch;
    let queryLen = query.length;

    if (queryLen === 0) {
      // Do nothing: the matches should be wiped out already.
      return;
    }

    if (!caseSensitive) {
      pageContent = pageContent.toLowerCase();
      query = query.toLowerCase();
    }

    if (phraseSearch) {
      this._calculatePhraseMatch(query, pageIndex, pageContent);
    } else {
      this._calculateWordMatch(query, pageIndex, pageContent);
    }

    this._updatePage(pageIndex);
    if (this.resumePageIdx === pageIndex) {
      this.resumePageIdx = null;
      this._nextPageMatch();
    }

    // Update the match count.
    if (this.pageMatches[pageIndex].length > 0) {
      this.matchCount += this.pageMatches[pageIndex].length;
      this._updateUIResultsCount();
    }
  }

  _extractText() {
    if (this.startedTextExtraction) {
      return;
    }
    this.startedTextExtraction = true;
    this.pageContents.length = 0;

    let promise = Promise.resolve();
    for (let i = 0, ii = this.pdfViewer.pagesCount; i < ii; i++) {
      let extractTextCapability = createPromiseCapability();
      this.extractTextPromises[i] = extractTextCapability.promise;

      promise = promise.then(() => {
        return this.pdfViewer.getPageTextContent(i).then((textContent) => {
          let textItems = textContent.items;
          let strBuf = [];

          for (let j = 0, jj = textItems.length; j < jj; j++) {
            strBuf.push(textItems[j].str);
          }
          // Store the pageContent as a string.
          this.pageContents[i] = strBuf.join('');
          extractTextCapability.resolve(i);
        }, (reason) => {
          console.error(`Unable to get page ${i + 1} text content`, reason);
          // Page error -- assuming no text content.
          this.pageContents[i] = '';
          extractTextCapability.resolve(i);
        });
      });
    }
  }

  _updatePage(index) {
    if (this.selected.pageIdx === index) {
      // If the page is selected, scroll the page into view, which triggers
      // rendering the page, which adds the textLayer. Once the textLayer is
      // build, it will scroll onto the selected match.
      this.pdfViewer.currentPageNumber = index + 1;
    }

    let page = this.pdfViewer.getPageView(index);
    if (page.textLayer) {
      page.textLayer.updateMatches();
    }
  }

  _nextMatch() {
    let previous = this.state.findPrevious;
    let currentPageIndex = this.pdfViewer.currentPageNumber - 1;
    let numPages = this.pdfViewer.pagesCount;

    this.active = true;

    if (this.dirtyMatch) {
      // Need to recalculate the matches, reset everything.
      this.dirtyMatch = false;
      this.selected.pageIdx = this.selected.matchIdx = -1;
      this.offset.pageIdx = currentPageIndex;
      this.offset.matchIdx = null;
      this.hadMatch = false;
      this.resumePageIdx = null;
      this.pageMatches = [];
      this.matchCount = 0;
      this.pageMatchesLength = null;

      for (let i = 0; i < numPages; i++) {
        // Wipe out any previously highlighted matches.
        this._updatePage(i);

        // Start finding the matches as soon as the text is extracted.
        if (!(i in this.pendingFindMatches)) {
          this.pendingFindMatches[i] = true;
          this.extractTextPromises[i].then((pageIdx) => {
            delete this.pendingFindMatches[pageIdx];
            this._calculateMatch(pageIdx);
          });
        }
      }
    }

    // If there's no query there's no point in searching.
    if (this.state.query === '') {
      this._updateUIState(FindState.FOUND);
      return;
    }

    // If we're waiting on a page, we return since we can't do anything else.
    if (this.resumePageIdx) {
      return;
    }

    let offset = this.offset;
    // Keep track of how many pages we should maximally iterate through.
    this.pagesToSearch = numPages;
    // If there's already a `matchIdx` that means we are iterating through a
    // page's matches.
    if (offset.matchIdx !== null) {
      let numPageMatches = this.pageMatches[offset.pageIdx].length;
      if ((!previous && offset.matchIdx + 1 < numPageMatches) ||
          (previous && offset.matchIdx > 0)) {
        // The simple case; we just have advance the matchIdx to select
        // the next match on the page.
        this.hadMatch = true;
        offset.matchIdx = (previous ? offset.matchIdx - 1 :
                                      offset.matchIdx + 1);
        this._updateMatch(/* found = */ true);
        return;
      }
      // We went beyond the current page's matches, so we advance to
      // the next page.
      this._advanceOffsetPage(previous);
    }
    // Start searching through the page.
    this._nextPageMatch();
  }

  _matchesReady(matches) {
    let offset = this.offset;
    let numMatches = matches.length;
    let previous = this.state.findPrevious;

    if (numMatches) {
      // There were matches for the page, so initialize `matchIdx`.
      this.hadMatch = true;
      offset.matchIdx = (previous ? numMatches - 1 : 0);
      this._updateMatch(/* found = */ true);
      return true;
    }
    // No matches, so attempt to search the next page.
    this._advanceOffsetPage(previous);
    if (offset.wrapped) {
      offset.matchIdx = null;
      if (this.pagesToSearch < 0) {
        // No point in wrapping again, there were no matches.
        this._updateMatch(/* found = */ false);
        // While matches were not found, searching for a page
        // with matches should nevertheless halt.
        return true;
      }
    }
    // Matches were not found (and searching is not done).
    return false;
  }

  _nextPageMatch() {
    if (this.resumePageIdx !== null) {
      console.error('There can only be one pending page.');
    }

    let matches = null;
    do {
      let pageIdx = this.offset.pageIdx;
      matches = this.pageMatches[pageIdx];
      if (!matches) {
        // The matches don't exist yet for processing by `_matchesReady`,
        // so set a resume point for when they do exist.
        this.resumePageIdx = pageIdx;
        break;
      }
    } while (!this._matchesReady(matches));
  }

  _advanceOffsetPage(previous) {
    let offset = this.offset;
    let numPages = this.extractTextPromises.length;
    offset.pageIdx = (previous ? offset.pageIdx - 1 : offset.pageIdx + 1);
    offset.matchIdx = null;

    this.pagesToSearch--;

    if (offset.pageIdx >= numPages || offset.pageIdx < 0) {
      offset.pageIdx = (previous ? numPages - 1 : 0);
      offset.wrapped = true;
    }
  }

  _updateMatch(found = false) {
    let state = FindState.NOT_FOUND;
    let wrapped = this.offset.wrapped;
    this.offset.wrapped = false;

    if (found) {
      let previousPage = this.selected.pageIdx;
      this.selected.pageIdx = this.offset.pageIdx;
      this.selected.matchIdx = this.offset.matchIdx;
      state = (wrapped ? FindState.WRAPPED : FindState.FOUND);

      // Update the currently selected page to wipe out any selected matches.
      if (previousPage !== -1 && previousPage !== this.selected.pageIdx) {
        this._updatePage(previousPage);
      }
    }

    this._updateUIState(state, this.state.findPrevious);
    if (this.selected.pageIdx !== -1) {
      this._updatePage(this.selected.pageIdx);
    }
  }

  _updateUIResultsCount() {
    if (this.onUpdateResultsCount) {
      this.onUpdateResultsCount(this.matchCount);
    }
  }

  _updateUIState(state, previous) {
    if (this.onUpdateState) {
      this.onUpdateState(state, previous, this.matchCount);
    }
  }
}

export {
  FindState,
  PDFFindController,
};