From 6d804d657f941df0a73f5a8b5758f00f71f6d740 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald <jonas.jenwald@gmail.com> Date: Sat, 1 Sep 2018 01:28:19 +0200 Subject: [PATCH] Add initial support for "Whole words" searching in the viewer As outlined in https://bugzilla.mozilla.org/show_bug.cgi?id=1282759 the internal Firefox name for the feature is `entireWord`, hence that name is used here as well for consistency (with "Whole words" being limited to the UI). Given existing limitations of the PDF.js search functionality, e.g. the existing problems of searching across "new lines", there are some edge cases where "Whole words" searching will ignore (valid) results. However, considering that this is a pre-existing issue related to the way that the find controller joins text-content together, that shouldn't have to block this new feature in my opinion. *Please note:* In order to enable this feature in the `MOZCENTRAL` version, a small follow-up patch for [PdfjsChromeUtils.jsm](https://hg.mozilla.org/mozilla-central/file/tip/browser/extensions/pdfjs/content/PdfjsChromeUtils.jsm) will be required once this has landed in `mozilla-central`. 
--- l10n/en-US/viewer.properties | 1 + l10n/nl/viewer.properties | 1 + l10n/sv-SE/viewer.properties | 1 + test/unit/clitests.json | 1 + test/unit/jasmine-boot.js | 1 + test/unit/pdf_find_utils_spec.js | 56 ++++++++++++++++ web/app.js | 3 + web/firefoxcom.js | 8 ++- web/pdf_find_bar.js | 8 ++- web/pdf_find_controller.js | 40 ++++++++++-- web/pdf_find_utils.js | 107 +++++++++++++++++++++++++++++++ web/viewer.html | 8 ++- web/viewer.js | 1 + 13 files changed, 226 insertions(+), 10 deletions(-) create mode 100644 test/unit/pdf_find_utils_spec.js create mode 100644 web/pdf_find_utils.js diff --git a/l10n/en-US/viewer.properties b/l10n/en-US/viewer.properties index b110cf4e6..39f1a99bc 100644 --- a/l10n/en-US/viewer.properties +++ b/l10n/en-US/viewer.properties @@ -165,6 +165,7 @@ find_next.title=Find the next occurrence of the phrase find_next_label=Next find_highlight=Highlight all find_match_case_label=Match case +find_entire_word_label=Whole words find_reached_top=Reached top of document, continued from bottom find_reached_bottom=Reached end of document, continued from top # LOCALIZATION NOTE (find_matches_count): "{{current}}" and "{{total}}" will be diff --git a/l10n/nl/viewer.properties b/l10n/nl/viewer.properties index 61fb64f60..477d36645 100644 --- a/l10n/nl/viewer.properties +++ b/l10n/nl/viewer.properties @@ -165,6 +165,7 @@ find_next.title=De volgende overeenkomst van de tekst zoeken find_next_label=Volgende find_highlight=Alles markeren find_match_case_label=Hoofdlettergevoelig +find_entire_word_label=Hele woorden find_reached_top=Bovenkant van document bereikt, doorgegaan vanaf onderkant find_reached_bottom=Onderkant van document bereikt, doorgegaan vanaf bovenkant # LOCALIZATION NOTE (find_matches_count): "{{current}}" and "{{total}}" will be diff --git a/l10n/sv-SE/viewer.properties b/l10n/sv-SE/viewer.properties index eb5bbdf8c..3233913ed 100644 --- a/l10n/sv-SE/viewer.properties +++ b/l10n/sv-SE/viewer.properties @@ -165,6 +165,7 @@ 
find_next.title=Hitta nästa förekomst av frasen find_next_label=Nästa find_highlight=Markera alla find_match_case_label=Matcha versal/gemen +find_entire_word_label=Hela ord find_reached_top=Nådde början av dokumentet, började från slutet find_reached_bottom=Nådde slutet på dokumentet, började från början # LOCALIZATION NOTE (find_matches_count): "{{current}}" and "{{total}}" will be diff --git a/test/unit/clitests.json b/test/unit/clitests.json index 337109e7c..a0348fb7e 100644 --- a/test/unit/clitests.json +++ b/test/unit/clitests.json @@ -25,6 +25,7 @@ "network_utils_spec.js", "node_stream_spec.js", "parser_spec.js", + "pdf_find_utils_spec.js", "pdf_history.js", "primitives_spec.js", "stream_spec.js", diff --git a/test/unit/jasmine-boot.js b/test/unit/jasmine-boot.js index 5297a56ee..ef87c76ee 100644 --- a/test/unit/jasmine-boot.js +++ b/test/unit/jasmine-boot.js @@ -67,6 +67,7 @@ function initializePDFJS(callback) { 'pdfjs-test/unit/network_spec', 'pdfjs-test/unit/network_utils_spec', 'pdfjs-test/unit/parser_spec', + 'pdfjs-test/unit/pdf_find_utils_spec', 'pdfjs-test/unit/pdf_history_spec', 'pdfjs-test/unit/primitives_spec', 'pdfjs-test/unit/stream_spec', diff --git a/test/unit/pdf_find_utils_spec.js b/test/unit/pdf_find_utils_spec.js new file mode 100644 index 000000000..9ec571517 --- /dev/null +++ b/test/unit/pdf_find_utils_spec.js @@ -0,0 +1,56 @@ +/* Copyright 2018 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import { CharacterType, getCharacterType } from '../../web/pdf_find_utils'; + +describe('pdf_find_utils', function() { + describe('getCharacterType', function() { + it('gets expected character types', function() { + const characters = { + 'A': CharacterType.ALPHA_LETTER, + 'a': CharacterType.ALPHA_LETTER, + '0': CharacterType.ALPHA_LETTER, + '5': CharacterType.ALPHA_LETTER, + '\xC4': CharacterType.ALPHA_LETTER, // 'Ä' + '\xE4': CharacterType.ALPHA_LETTER, // 'ä' + '_': CharacterType.ALPHA_LETTER, + ' ': CharacterType.SPACE, + '\t': CharacterType.SPACE, + '\r': CharacterType.SPACE, + '\n': CharacterType.SPACE, + '\xA0': CharacterType.SPACE, + '-': CharacterType.PUNCT, + ',': CharacterType.PUNCT, + '.': CharacterType.PUNCT, + ';': CharacterType.PUNCT, + ':': CharacterType.PUNCT, + '\u2122': CharacterType.ALPHA_LETTER, // trademark + '\u0E25': CharacterType.THAI_LETTER, + '\u4000': CharacterType.HAN_LETTER, + '\uF950': CharacterType.HAN_LETTER, + '\u30C0': CharacterType.KATAKANA_LETTER, + '\u3050': CharacterType.HIRAGANA_LETTER, + '\uFF80': CharacterType.HALFWIDTH_KATAKANA_LETTER, + }; + + for (const character in characters) { + const charCode = character.charCodeAt(0); + const type = characters[character]; + + expect(getCharacterType(charCode)).toEqual(type); + } + }); + }); +}); diff --git a/web/app.js b/web/app.js index ba1de89c4..1cc90cdb7 100644 --- a/web/app.js +++ b/web/app.js @@ -1959,6 +1959,7 @@ function webViewerFind(evt) { query: evt.query, phraseSearch: evt.phraseSearch, caseSensitive: evt.caseSensitive, + entireWord: evt.entireWord, highlightAll: evt.highlightAll, findPrevious: evt.findPrevious, }); @@ -1969,6 +1970,7 @@ function webViewerFindFromUrlHash(evt) { query: evt.query, phraseSearch: evt.phraseSearch, caseSensitive: false, + entireWord: false, highlightAll: true, findPrevious: false, }); @@ -2105,6 +2107,7 @@ function webViewerKeyDown(evt) { query: findState.query, phraseSearch: findState.phraseSearch, caseSensitive: 
findState.caseSensitive, + entireWord: findState.entireWord, highlightAll: findState.highlightAll, findPrevious: cmd === 5 || cmd === 12, }); diff --git a/web/firefoxcom.js b/web/firefoxcom.js index 80553cea9..46c75db6a 100644 --- a/web/firefoxcom.js +++ b/web/firefoxcom.js @@ -167,7 +167,8 @@ class MozL10n { 'find', 'findagain', 'findhighlightallchange', - 'findcasesensitivitychange' + 'findcasesensitivitychange', + 'findentirewordchange', ]; let handleEvent = function(evt) { if (!PDFViewerApplication.initialized) { @@ -179,13 +180,14 @@ class MozL10n { query: evt.detail.query, phraseSearch: true, caseSensitive: !!evt.detail.caseSensitive, + entireWord: !!evt.detail.entireWord, highlightAll: !!evt.detail.highlightAll, findPrevious: !!evt.detail.findPrevious, }); }; - for (let i = 0, len = events.length; i < len; i++) { - window.addEventListener(events[i], handleEvent); + for (let event of events) { + window.addEventListener(event, handleEvent); } })(); diff --git a/web/pdf_find_bar.js b/web/pdf_find_bar.js index 53e2c0201..795b725e7 100644 --- a/web/pdf_find_bar.js +++ b/web/pdf_find_bar.js @@ -33,6 +33,7 @@ class PDFFindBar { this.findField = options.findField || null; this.highlightAll = options.highlightAllCheckbox || null; this.caseSensitive = options.caseSensitiveCheckbox || null; + this.entireWord = options.entireWordCheckbox || null; this.findMsg = options.findMsg || null; this.findResultsCount = options.findResultsCount || null; this.findStatusIcon = options.findStatusIcon || null; @@ -85,6 +86,10 @@ class PDFFindBar { this.dispatchEvent('casesensitivitychange'); }); + this.entireWord.addEventListener('click', () => { + this.dispatchEvent('entirewordchange'); + }); + this.eventBus.on('resize', this._adjustWidth.bind(this)); } @@ -97,8 +102,9 @@ class PDFFindBar { source: this, type, query: this.findField.value, - caseSensitive: this.caseSensitive.checked, phraseSearch: true, + caseSensitive: this.caseSensitive.checked, + entireWord: 
this.entireWord.checked, highlightAll: this.highlightAll.checked, findPrevious: findPrev, }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 2dcf82c6a..2f5e91911 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -14,6 +14,7 @@ */ import { createPromiseCapability } from 'pdfjs-lib'; +import { getCharacterType } from './pdf_find_utils'; import { getGlobalEventBus } from './dom_events'; import { scrollIntoView } from './ui_utils'; @@ -190,7 +191,30 @@ class PDFFindController { } } - _calculatePhraseMatch(query, pageIndex, pageContent) { + /** + * Determine if the search query constitutes a "whole word", by comparing the + * first/last character type with the preceding/following character type. + */ + _isEntireWord(content, startIdx, length) { + if (startIdx > 0) { + const first = content.charCodeAt(startIdx); + const limit = content.charCodeAt(startIdx - 1); + if (getCharacterType(first) === getCharacterType(limit)) { + return false; + } + } + const endIdx = (startIdx + length - 1); + if (endIdx < (content.length - 1)) { + const last = content.charCodeAt(endIdx); + const limit = content.charCodeAt(endIdx + 1); + if (getCharacterType(last) === getCharacterType(limit)) { + return false; + } + } + return true; + } + + _calculatePhraseMatch(query, pageIndex, pageContent, entireWord) { let matches = []; let queryLen = query.length; let matchIdx = -queryLen; @@ -199,12 +223,15 @@ class PDFFindController { if (matchIdx === -1) { break; } + if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { + continue; + } matches.push(matchIdx); } this.pageMatches[pageIndex] = matches; } - _calculateWordMatch(query, pageIndex, pageContent) { + _calculateWordMatch(query, pageIndex, pageContent, entireWord) { let matchesWithLength = []; // Divide the query into pieces and search for text in each piece. 
let queryArray = query.match(/\S+/g); @@ -217,6 +244,10 @@ class PDFFindController { if (matchIdx === -1) { break; } + if (entireWord && + !this._isEntireWord(pageContent, matchIdx, subqueryLen)) { + continue; + } // Other searches do not, so we store the length. matchesWithLength.push({ match: matchIdx, @@ -244,6 +275,7 @@ class PDFFindController { let query = this._normalize(this.state.query); let caseSensitive = this.state.caseSensitive; let phraseSearch = this.state.phraseSearch; + const entireWord = this.state.entireWord; let queryLen = query.length; if (queryLen === 0) { @@ -257,9 +289,9 @@ class PDFFindController { } if (phraseSearch) { - this._calculatePhraseMatch(query, pageIndex, pageContent); + this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord); } else { - this._calculateWordMatch(query, pageIndex, pageContent); + this._calculateWordMatch(query, pageIndex, pageContent, entireWord); } this._updatePage(pageIndex); diff --git a/web/pdf_find_utils.js b/web/pdf_find_utils.js new file mode 100644 index 000000000..419684a5e --- /dev/null +++ b/web/pdf_find_utils.js @@ -0,0 +1,107 @@ +/* Copyright 2018 Mozilla Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +const CharacterType = { + SPACE: 0, + ALPHA_LETTER: 1, + PUNCT: 2, + HAN_LETTER: 3, + KATAKANA_LETTER: 4, + HIRAGANA_LETTER: 5, + HALFWIDTH_KATAKANA_LETTER: 6, + THAI_LETTER: 7, +}; + +function isAlphabeticalScript(charCode) { + return charCode < 0x2E80; +} + +function isAscii(charCode) { + return (charCode & 0xFF80) === 0; +} + +function isAsciiAlpha(charCode) { + return (charCode >= /* a = */ 0x61 && charCode <= /* z = */ 0x7A) || + (charCode >= /* A = */ 0x41 && charCode <= /* Z = */ 0x5A); +} + +function isAsciiDigit(charCode) { + return (charCode >= /* 0 = */ 0x30 && charCode <= /* 9 = */ 0x39); +} + +function isAsciiSpace(charCode) { + return (charCode === /* SPACE = */ 0x20 || charCode === /* TAB = */ 0x09 || + charCode === /* CR = */ 0x0D || charCode === /* LF = */ 0x0A); +} + +function isHan(charCode) { + return (charCode >= 0x3400 && charCode <= 0x9FFF) || + (charCode >= 0xF900 && charCode <= 0xFAFF); +} + +function isKatakana(charCode) { + return (charCode >= 0x30A0 && charCode <= 0x30FF); +} + +function isHiragana(charCode) { + return (charCode >= 0x3040 && charCode <= 0x309F); +} + +function isHalfwidthKatakana(charCode) { + return (charCode >= 0xFF60 && charCode <= 0xFF9F); +} + +function isThai(charCode) { + return (charCode & 0xFF80) === 0x0E00; +} + +/** + * This function is based on the word-break detection implemented in: + * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp + */ +function getCharacterType(charCode) { + if (isAlphabeticalScript(charCode)) { + if (isAscii(charCode)) { + if (isAsciiSpace(charCode)) { + return CharacterType.SPACE; + } else if (isAsciiAlpha(charCode) || isAsciiDigit(charCode) || + charCode === /* UNDERSCORE = */ 0x5F) { + return CharacterType.ALPHA_LETTER; + } + return CharacterType.PUNCT; + } else if (isThai(charCode)) { + return CharacterType.THAI_LETTER; + } else if (charCode === /* NBSP = */ 0xA0) { + return CharacterType.SPACE; + } + return CharacterType.ALPHA_LETTER; + } + + if 
(isHan(charCode)) { + return CharacterType.HAN_LETTER; + } else if (isKatakana(charCode)) { + return CharacterType.KATAKANA_LETTER; + } else if (isHiragana(charCode)) { + return CharacterType.HIRAGANA_LETTER; + } else if (isHalfwidthKatakana(charCode)) { + return CharacterType.HALFWIDTH_KATAKANA_LETTER; + } + return CharacterType.ALPHA_LETTER; +} + +export { + CharacterType, + getCharacterType, +}; diff --git a/web/viewer.html b/web/viewer.html index 32469d113..f2a284108 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -104,15 +104,19 @@ See https://github.com/adobe-type-tools/cmap-resources </div> </div> - <div id="findbarOptionsContainer"> + <div id="findbarOptionsOneContainer"> <input type="checkbox" id="findHighlightAll" class="toolbarField" tabindex="94"> <label for="findHighlightAll" class="toolbarLabel" data-l10n-id="find_highlight">Highlight all</label> <input type="checkbox" id="findMatchCase" class="toolbarField" tabindex="95"> <label for="findMatchCase" class="toolbarLabel" data-l10n-id="find_match_case_label">Match case</label> </div> + <div id="findbarOptionsTwoContainer"> + <input type="checkbox" id="findEntireWord" class="toolbarField" tabindex="96"> + <label for="findEntireWord" class="toolbarLabel" data-l10n-id="find_entire_word_label">Whole words</label> + <span id="findResultsCount" class="toolbarLabel hidden"></span> + </div> <div id="findbarMessageContainer"> - <span id="findResultsCount" class="toolbarLabel hidden"></span> <span id="findMsg" class="toolbarLabel"></span> </div> </div> <!-- findbar --> diff --git a/web/viewer.js b/web/viewer.js index 7a9ae61c0..113b2799e 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -134,6 +134,7 @@ function getViewerConfiguration() { findField: document.getElementById('findInput'), highlightAllCheckbox: document.getElementById('findHighlightAll'), caseSensitiveCheckbox: document.getElementById('findMatchCase'), + entireWordCheckbox: document.getElementById('findEntireWord'), findMsg: 
document.getElementById('findMsg'), findResultsCount: document.getElementById('findResultsCount'), findStatusIcon: document.getElementById('findStatusIcon'),