diff --git a/l10n/en-US/viewer.properties b/l10n/en-US/viewer.properties index dc54ed39d..6f5a77fe5 100644 --- a/l10n/en-US/viewer.properties +++ b/l10n/en-US/viewer.properties @@ -170,6 +170,7 @@ find_next.title=Find the next occurrence of the phrase find_next_label=Next find_highlight=Highlight all find_match_case_label=Match case +find_match_diacritics_label=Match Diacritics find_entire_word_label=Whole words find_reached_top=Reached top of document, continued from bottom find_reached_bottom=Reached end of document, continued from top diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index d1d37ef26..2e8519f82 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -357,6 +357,7 @@ !issue4650.pdf !issue6721_reduced.pdf !issue3025.pdf +!french_diacritics.pdf !issue2099-1.pdf !issue3371.pdf !issue2956.pdf diff --git a/test/pdfs/french_diacritics.pdf b/test/pdfs/french_diacritics.pdf new file mode 100644 index 000000000..ba5b5cb07 Binary files /dev/null and b/test/pdfs/french_diacritics.pdf differ diff --git a/test/unit/pdf_find_controller_spec.js b/test/unit/pdf_find_controller_spec.js index 8c6453aff..ee1c53a9b 100644 --- a/test/unit/pdf_find_controller_spec.js +++ b/test/unit/pdf_find_controller_spec.js @@ -86,6 +86,7 @@ function testSearch({ entireWord: false, phraseSearch: true, findPrevious: false, + matchDiacritics: false, }, state ); @@ -264,5 +265,293 @@ describe("pdf_find_controller", function () { pageMatches: [[19, 46, 62]], pageMatchesLength: [[8, 8, 8]], }); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "1/2", + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[27, 54]], + pageMatchesLength: [[1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "½", + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[27, 54]], + pageMatchesLength: [[1, 1]], + }); + }); + + it("performs a normal search, where the text with diacritics is normalized", async function () { + const { eventBus, pdfFindController } = await initPdfFindController( + "french_diacritics.pdf" + ); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "a", + }, + matchesPerPage: [6], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[0, 2, 4, 6, 8, 10]], + pageMatchesLength: [[1, 1, 1, 1, 1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "u", + }, + matchesPerPage: [6], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[44, 46, 48, 50, 52, 54]], + pageMatchesLength: [[1, 1, 1, 1, 1, 1]], + }); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "ë", + matchDiacritics: true, + }, + matchesPerPage: [2], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[28, 30]], + pageMatchesLength: [[1, 1]], + }); + }); + + it("performs a search where one of the results contains an hyphen", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "optimiz", + }, + matchesPerPage: [1, 4, 2, 3, 3, 0, 2, 9, 1, 0, 0, 6, 3, 4], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + }); + }); + + it("performs a search where the result is on two lines", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "user experience", + }, + matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[2743]], + pageMatchesLength: [[14]], + }); + }); + + it("performs a search where the result is on two lines with a punctuation at eol", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "version.the", + }, + matchesPerPage: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 1, + matchIndex: 0, + }, + pageMatches: [[], [1493]], + pageMatchesLength: [[], [11]], + }); + }); + + it("performs a search with a minus sign in the query", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "trace-based just-in-time", + }, + matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [ + [0], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [2087], + ], + pageMatchesLength: [ + [24], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [], + [24], + ], + }); + }); + + it("performs a search with square brackets in the query", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "[Programming Languages]", + }, + matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[1501]], + pageMatchesLength: [[25]], + }); + }); + + it("performs a search with parenthesis in the query", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + await testSearch({ + eventBus, + pdfFindController, + state: { + query: "\t (checks)", + }, + matchesPerPage: [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 1, + matchIndex: 0, + }, + pageMatches: [[], [201]], + pageMatchesLength: [[], [9]], + }); + }); + + it("performs a search with a final dot in the query", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + // The whitespace after the dot mustn't be matched. + const query = "complex applications."; + + await testSearch({ + eventBus, + pdfFindController, + state: { + query, + }, + matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[1946]], + pageMatchesLength: [[21]], + }); + }); + + it("performs a search with a dot in the query and a missing whitespace", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + + // The whitespace after the dot must be matched. + const query = "complex applications.J"; + + await testSearch({ + eventBus, + pdfFindController, + state: { + query, + }, + matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[1946]], + pageMatchesLength: [[23]], + }); + }); + + it("performs a search with a dot followed by a whitespace in the query", async function () { + const { eventBus, pdfFindController } = await initPdfFindController(); + const query = "complex applications. j"; + + await testSearch({ + eventBus, + pdfFindController, + state: { + query, + }, + matchesPerPage: [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + selectedMatch: { + pageIndex: 0, + matchIndex: 0, + }, + pageMatches: [[1946]], + pageMatchesLength: [[23]], + }); }); }); diff --git a/test/unit/ui_utils_spec.js b/test/unit/ui_utils_spec.js index c83e2f056..c98c6d7ae 100644 --- a/test/unit/ui_utils_spec.js +++ b/test/unit/ui_utils_spec.js @@ -51,6 +51,11 @@ describe("ui_utils", function () { expect(binarySearchFirstItem([2, 3, 4], isGreater3)).toEqual(2); expect(binarySearchFirstItem([4, 5, 6], isGreater3)).toEqual(0); }); + it("three numeric entries and a start index", function () { + expect(binarySearchFirstItem([0, 1, 2, 3, 4], isGreater3, 2)).toEqual(4); + expect(binarySearchFirstItem([2, 3, 4], isGreater3, 2)).toEqual(2); + expect(binarySearchFirstItem([4, 5, 6], isGreater3, 1)).toEqual(1); + }); }); describe("isValidRotation", function () { diff --git a/web/app.js b/web/app.js index 0e879e1e5..6543eba70 100644 --- a/web/app.js +++ b/web/app.js @@ -2592,6 +2592,7 @@ function webViewerFindFromUrlHash(evt) { entireWord: false, highlightAll: true, findPrevious: false, + matchDiacritics: true, }); } diff --git a/web/firefoxcom.js b/web/firefoxcom.js index 0344f4cfb..ea64b5e88 100644 --- a/web/firefoxcom.js +++ b/web/firefoxcom.js @@ -218,6 +218,7 @@ class MozL10n { "findcasesensitivitychange", "findentirewordchange", "findbarclose", + "finddiacriticmatchingchange", ]; const findLen = "find".length; @@ -238,6 +239,7 @@ class MozL10n { entireWord: !!detail.entireWord, highlightAll: !!detail.highlightAll, findPrevious: !!detail.findPrevious, + matchDiacritics: !!detail.matchDiacritics, }); }; diff --git a/web/pdf_find_bar.js b/web/pdf_find_bar.js index 3c0192566..54fda4ddb 100644 --- a/web/pdf_find_bar.js +++ b/web/pdf_find_bar.js @@ -32,6 +32,7 @@ class PDFFindBar { this.findField = options.findField; this.highlightAll = options.highlightAllCheckbox; this.caseSensitive = options.caseSensitiveCheckbox; + this.matchDiacritics = options.matchDiacriticsCheckbox; this.entireWord = options.entireWordCheckbox; this.findMsg = options.findMsg; this.findResultsCount = options.findResultsCount; @@ -82,6 +83,10 @@ class PDFFindBar { this.dispatchEvent("entirewordchange"); }); + this.matchDiacritics.addEventListener("click", () => { + this.dispatchEvent("diacriticmatchingchange"); + }); + this.eventBus._on("resize", this._adjustWidth.bind(this)); } @@ -99,6 +104,7 @@ class PDFFindBar { entireWord: this.entireWord.checked, highlightAll: this.highlightAll.checked, findPrevious: findPrev, + matchDiacritics: this.matchDiacritics.checked, }); } diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 49677e15f..218c9ae98 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -17,9 +17,9 @@ /** @typedef {import("./event_utils").EventBus} EventBus */ /** @typedef {import("./interfaces").IPDFLinkService} IPDFLinkService */ +import { binarySearchFirstItem, scrollIntoView } from "./ui_utils.js"; import { createPromiseCapability } from "pdfjs-lib"; import { getCharacterType } from "./pdf_find_utils.js"; -import { scrollIntoView } from "./ui_utils.js"; const FindState = { FOUND: 0, @@ -47,47 +47,184 @@ const CHARACTERS_TO_NORMALIZE = { "\u00BE": "3/4", // Vulgar fraction three quarters }; +// These diacritics aren't considered as combining diacritics +// when searching in a document: +// https://searchfox.org/mozilla-central/source/intl/unicharutil/util/is_combining_diacritic.py. +// The combining class definitions can be found: +// https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values +// Category 0 corresponds to [^\p{Mn}]. +const DIACRITICS_EXCEPTION = new Set([ + // UNICODE_COMBINING_CLASS_KANA_VOICING + // https://www.compart.com/fr/unicode/combining/8 + 0x3099, 0x309a, + // UNICODE_COMBINING_CLASS_VIRAMA (under 0xFFFF) + // https://www.compart.com/fr/unicode/combining/9 + 0x094d, 0x09cd, 0x0a4d, 0x0acd, 0x0b4d, 0x0bcd, 0x0c4d, 0x0ccd, 0x0d3b, + 0x0d3c, 0x0d4d, 0x0dca, 0x0e3a, 0x0eba, 0x0f84, 0x1039, 0x103a, 0x1714, + 0x1734, 0x17d2, 0x1a60, 0x1b44, 0x1baa, 0x1bab, 0x1bf2, 0x1bf3, 0x2d7f, + 0xa806, 0xa82c, 0xa8c4, 0xa953, 0xa9c0, 0xaaf6, 0xabed, + // 91 + // https://www.compart.com/fr/unicode/combining/91 + 0x0c56, + // 129 + // https://www.compart.com/fr/unicode/combining/129 + 0x0f71, + // 130 + // https://www.compart.com/fr/unicode/combining/130 + 0x0f72, 0x0f7a, 0x0f7b, 0x0f7c, 0x0f7d, 0x0f80, + // 132 + // https://www.compart.com/fr/unicode/combining/132 + 0x0f74, +]); +const DIACRITICS_EXCEPTION_STR = [...DIACRITICS_EXCEPTION.values()] + .map(x => String.fromCharCode(x)) + .join(""); + +const DIACRITICS_REG_EXP = /\p{M}+/gu; +const SPECIAL_CHARS_REG_EXP = + /([.*+?^${}()|[\]\\])|(\p{P})|(\s+)|(\p{M})|(\p{L})/gu; +const NOT_DIACRITIC_FROM_END_REG_EXP = /([^\p{M}])\p{M}*$/u; +const NOT_DIACRITIC_FROM_START_REG_EXP = /^\p{M}*([^\p{M}])/u; + let normalizationRegex = null; function normalize(text) { + // The diacritics in the text or in the query can be composed or not. + // So we use a decomposed text using NFD (and the same for the query) + // in order to be sure that diacritics are in the same order. + if (!normalizationRegex) { // Compile the regular expression for text normalization once. const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); - normalizationRegex = new RegExp(`[${replace}]`, "g"); + normalizationRegex = new RegExp( + `([${replace}])|(\\S-\\n)|(\\n)|(\\p{M}+)`, + "gum" + ); } - let diffs = null; - const normalizedText = text.replace(normalizationRegex, function (ch, index) { - const normalizedCh = CHARACTERS_TO_NORMALIZE[ch], - diff = normalizedCh.length - ch.length; - if (diff !== 0) { - (diffs ||= []).push([index, diff]); - } - return normalizedCh; - }); - return [normalizedText, diffs]; + // The goal of this function is to normalize the string and + // be able to get from an index in the new string the + // corresponding index in the old string. + // For example if we have: abCd12ef456gh where C is replaced by ccc + // and numbers replaced by nothing (it's the case for diacritics), then + // we'll obtain the normalized string: abcccdefgh. + // So here the reverse map is: [0,1,2,2,2,3,6,7,11,12]. + + // The goal is to obtain the array: [[0, 0], [3, -1], [4, -2], + // [6, 0], [8, 3]]. + // which can be used like this: + // - let say that i is the index in new string and j the index + // the old string. + // - if i is in [0; 3[ then j = i + 0 + // - if i is in [3; 4[ then j = i - 1 + // - if i is in [4; 6[ then j = i - 2 + // ... + // Thanks to a binary search it's easy to know where is i and what's the + // shift. + // Let say that the last entry in the array is [x, s] and we have a + // substitution at index y (old string) which will replace o chars by n chars. + // Firstly, if o === n, then no need to add a new entry: the shift is + // the same. + // Secondly, if o < n, then we push the n - o elements: + // [y - (s - 1), s - 1], [y - (s - 2), s - 2], ... + // Thirdly, if o > n, then we push the element: [y - (s - n), o + s - n] + + // Collect diacritics length and positions. + const rawDiacriticsPositions = []; + let m; + while ((m = DIACRITICS_REG_EXP.exec(text)) !== null) { + rawDiacriticsPositions.push([m[0].length, m.index]); + } + + let normalized = text.normalize("NFD"); + const positions = [[0, 0]]; + let k = 0; + let shift = 0; + let shiftOrigin = 0; + let eol = 0; + let hasDiacritics = false; + + normalized = normalized.replace( + normalizationRegex, + (match, p1, p2, p3, p4, i) => { + i -= shiftOrigin; + if (p1) { + // Maybe fractions or quotations mark... + const replacement = CHARACTERS_TO_NORMALIZE[match]; + const jj = replacement.length; + for (let j = 1; j < jj; j++) { + positions.push([i - shift + j, shift - j]); + } + shift -= jj - 1; + return replacement; + } + + if (p2) { + // "X-\n" is removed because an hyphen at the end of a line + // with not a space before is likely here to mark a break + // in a word. + positions.push([i - shift, 1 + shift]); + shift += 1; + shiftOrigin += 1; + eol += 1; + return p2.charAt(0); + } + + if (p3) { + // eol is replaced by space: "foo\nbar" is likely equivalent to + // "foo bar". + positions.push([i - shift + 1, shift - 1]); + shift -= 1; + shiftOrigin += 1; + eol += 1; + return " "; + } + + // Diacritics. + hasDiacritics = true; + let jj = p4.length; + if (i + eol === rawDiacriticsPositions[k]?.[1]) { + jj -= rawDiacriticsPositions[k][0]; + ++k; + } + + for (let j = 1; j < jj + 1; j++) { + // i is the position of the first diacritic + // so (i - 1) is the position for the letter before. + positions.push([i - 1 - shift + j, shift - j]); + } + shift -= jj; + shiftOrigin += jj; + + return p4; + } + ); + + positions.push([normalized.length, shift]); + + return [normalized, positions, hasDiacritics]; } // Determine the original, non-normalized, match index such that highlighting of // search results is correct in the `textLayer` for strings containing e.g. "½" // characters; essentially "inverting" the result of the `normalize` function. -function getOriginalIndex(matchIndex, diffs = null) { +function getOriginalIndex(diffs, pos, len) { if (!diffs) { - return matchIndex; + return [pos, len]; } - let totalDiff = 0; - for (const [index, diff] of diffs) { - const currentIndex = index + totalDiff; - if (currentIndex >= matchIndex) { - break; - } - if (currentIndex + diff > matchIndex) { - totalDiff += matchIndex - currentIndex; - break; - } - totalDiff += diff; + const start = pos; + const end = pos + len; + let i = binarySearchFirstItem(diffs, x => x[0] >= start); + if (diffs[i][0] > start) { + --i; } - return matchIndex - totalDiff; + + let j = binarySearchFirstItem(diffs, x => x[0] >= end, i); + if (diffs[j][0] > end) { + --j; + } + + return [start + diffs[i][1], len + diffs[j][1] - diffs[i][1]]; } /** @@ -276,6 +413,7 @@ class PDFFindController { this._extractTextPromises = []; this._pageContents = []; // Stores the normalized text for each page. this._pageDiffs = []; + this._hasDiacritics = []; this._matchesCountTotal = 0; this._pagesToSearch = null; this._pendingFindMatches = new Set(); @@ -331,192 +469,169 @@ class PDFFindController { return true; } - /** - * Helper for multi-term search that fills the `matchesWithLength` array - * and handles cases where one search term includes another search term (for - * example, "tamed tame" or "this is"). It looks for intersecting terms in - * the `matches` and keeps elements with a longer match length. - */ - _prepareMatches(matchesWithLength, matches, matchesLength) { - function isSubTerm(currentIndex) { - const currentElem = matchesWithLength[currentIndex]; - const nextElem = matchesWithLength[currentIndex + 1]; - - // Check for cases like "TAMEd TAME". - if ( - currentIndex < matchesWithLength.length - 1 && - currentElem.match === nextElem.match - ) { - currentElem.skipped = true; - return true; - } - - // Check for cases like "thIS IS". - for (let i = currentIndex - 1; i >= 0; i--) { - const prevElem = matchesWithLength[i]; - if (prevElem.skipped) { - continue; - } - if (prevElem.match + prevElem.matchLength < currentElem.match) { - break; - } - if ( - prevElem.match + prevElem.matchLength >= - currentElem.match + currentElem.matchLength - ) { - currentElem.skipped = true; - return true; - } - } - return false; - } - - // Sort the array of `{ match: , matchLength: }` - // objects on increasing index first and on the length otherwise. - matchesWithLength.sort(function (a, b) { - return a.match === b.match - ? a.matchLength - b.matchLength - : a.match - b.match; - }); - for (let i = 0, len = matchesWithLength.length; i < len; i++) { - if (isSubTerm(i)) { - continue; - } - matches.push(matchesWithLength[i].match); - matchesLength.push(matchesWithLength[i].matchLength); - } - } - /** * Determine if the search query constitutes a "whole word", by comparing the * first/last character type with the preceding/following character type. */ _isEntireWord(content, startIdx, length) { - if (startIdx > 0) { + let match = content + .slice(0, startIdx) + .match(NOT_DIACRITIC_FROM_END_REG_EXP); + if (match) { const first = content.charCodeAt(startIdx); - const limit = content.charCodeAt(startIdx - 1); + const limit = match[1].charCodeAt(0); if (getCharacterType(first) === getCharacterType(limit)) { return false; } } - const endIdx = startIdx + length - 1; - if (endIdx < content.length - 1) { - const last = content.charCodeAt(endIdx); - const limit = content.charCodeAt(endIdx + 1); + + match = content + .slice(startIdx + length) + .match(NOT_DIACRITIC_FROM_START_REG_EXP); + if (match) { + const last = content.charCodeAt(startIdx + length - 1); + const limit = match[1].charCodeAt(0); if (getCharacterType(last) === getCharacterType(limit)) { return false; } } + return true; } - _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) { + _calculateRegExpMatch(query, entireWord, pageIndex, pageContent) { const matches = [], matchesLength = []; - const queryLen = query.length; - let matchIdx = -queryLen; - while (true) { - matchIdx = pageContent.indexOf(query, matchIdx + queryLen); - if (matchIdx === -1) { - break; - } - if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { + const diffs = this._pageDiffs[pageIndex]; + let match; + while ((match = query.exec(pageContent)) !== null) { + if ( + entireWord && + !this._isEntireWord(pageContent, match.index, match[0].length) + ) { continue; } - const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs), - matchEnd = matchIdx + queryLen - 1, - originalQueryLen = - getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1; - matches.push(originalMatchIdx); - matchesLength.push(originalQueryLen); + const [matchPos, matchLen] = getOriginalIndex( + diffs, + match.index, + match[0].length + ); + + if (matchLen) { + matches.push(matchPos); + matchesLength.push(matchLen); + } } this._pageMatches[pageIndex] = matches; this._pageMatchesLength[pageIndex] = matchesLength; } - _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) { - const matchesWithLength = []; + _convertToRegExpString(query, hasDiacritics) { + const { matchDiacritics } = this._state; + let isUnicode = false; + query = query.replace( + SPECIAL_CHARS_REG_EXP, + ( + match, + p1 /* to escape */, + p2 /* punctuation */, + p3 /* whitespaces */, + p4 /* diacritics */, + p5 /* letters */ + ) => { + // We don't need to use a \s for whitespaces since all the different + // kind of whitespaces are replaced by a single " ". - // Divide the query into pieces and search for text in each piece. - const queryArray = query.match(/\S+/g); - for (let i = 0, len = queryArray.length; i < len; i++) { - const subquery = queryArray[i]; - const subqueryLen = subquery.length; - - let matchIdx = -subqueryLen; - while (true) { - matchIdx = pageContent.indexOf(subquery, matchIdx + subqueryLen); - if (matchIdx === -1) { - break; + if (p1) { + // Escape characters like *+?... to not interfer with regexp syntax. + return `[ ]*\\${p1}[ ]*`; } - if ( - entireWord && - !this._isEntireWord(pageContent, matchIdx, subqueryLen) - ) { - continue; + if (p2) { + // Allow whitespaces around punctuation signs. + return `[ ]*${p2}[ ]*`; + } + if (p3) { + // Replace spaces by \s+ to be sure to match any spaces. + return "[ ]+"; + } + if (matchDiacritics) { + return p4 || p5; } - const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs), - matchEnd = matchIdx + subqueryLen - 1, - originalQueryLen = - getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1; - // Other searches do not, so we store the length. - matchesWithLength.push({ - match: originalMatchIdx, - matchLength: originalQueryLen, - skipped: false, - }); + if (p4) { + // Diacritics are removed with few exceptions. + return DIACRITICS_EXCEPTION.has(p4.charCodeAt(0)) ? p4 : ""; + } + + // A letter has been matched and it can be followed by any diacritics + // in normalized text. + if (hasDiacritics) { + isUnicode = true; + return `${p5}\\p{M}*`; + } + return p5; + } + ); + + const trailingSpaces = "[ ]*"; + if (query.endsWith(trailingSpaces)) { + // The [ ]* has been added in order to help to match "foo . bar" but + // it doesn't make sense to match some whitespaces after the dot + // when it's the last character. + query = query.slice(0, query.length - trailingSpaces.length); + } + + if (matchDiacritics) { + // aX must not match aXY. + if (hasDiacritics) { + isUnicode = true; + query = `${query}(?=[${DIACRITICS_EXCEPTION_STR}]|[^\\p{M}]|$)`; } } - // Prepare arrays for storing the matches. - this._pageMatchesLength[pageIndex] = []; - this._pageMatches[pageIndex] = []; - - // Sort `matchesWithLength`, remove intersecting terms and put the result - // into the two arrays. - this._prepareMatches( - matchesWithLength, - this._pageMatches[pageIndex], - this._pageMatchesLength[pageIndex] - ); + return [isUnicode, query]; } _calculateMatch(pageIndex) { - let pageContent = this._pageContents[pageIndex]; - const pageDiffs = this._pageDiffs[pageIndex]; let query = this._query; - const { caseSensitive, entireWord, phraseSearch } = this._state; - if (query.length === 0) { // Do nothing: the matches should be wiped out already. return; } - if (!caseSensitive) { - pageContent = pageContent.toLowerCase(); - query = query.toLowerCase(); + const { caseSensitive, entireWord, phraseSearch } = this._state; + const pageContent = this._pageContents[pageIndex]; + const hasDiacritics = this._hasDiacritics[pageIndex]; + + let isUnicode = false; + if (phraseSearch) { + [isUnicode, query] = this._convertToRegExpString(query, hasDiacritics); + } else { + // Words are sorted in reverse order to be sure that "foobar" is matched + // before "foo" in case the query is "foobar foo". + const match = query.match(/\S+/g); + if (match) { + query = match + .sort() + .reverse() + .map(q => { + const [isUnicodePart, queryPart] = this._convertToRegExpString( + q, + hasDiacritics + ); + isUnicode ||= isUnicodePart; + return `(${queryPart})`; + }) + .join("|"); + } } - if (phraseSearch) { - this._calculatePhraseMatch( - query, - pageIndex, - pageContent, - pageDiffs, - entireWord - ); - } else { - this._calculateWordMatch( - query, - pageIndex, - pageContent, - pageDiffs, - entireWord - ); - } + const flags = `g${isUnicode ? "u" : ""}${caseSensitive ? "" : "i"}`; + query = new RegExp(query, flags); + + this._calculateRegExpMatch(query, entireWord, pageIndex, pageContent); // When `highlightAll` is set, ensure that the matches on previously // rendered (and still active) pages are correctly highlighted. @@ -560,12 +675,17 @@ class PDFFindController { for (let j = 0, jj = textItems.length; j < jj; j++) { strBuf.push(textItems[j].str); + if (textItems[j].hasEOL) { + strBuf.push("\n"); + } } // Store the normalized page content (text items) as one string. - [this._pageContents[i], this._pageDiffs[i]] = normalize( - strBuf.join("") - ); + [ + this._pageContents[i], + this._pageDiffs[i], + this._hasDiacritics[i], + ] = normalize(strBuf.join("")); extractTextCapability.resolve(i); }, reason => { @@ -576,6 +696,7 @@ class PDFFindController { // Page error -- assuming no text content. this._pageContents[i] = ""; this._pageDiffs[i] = null; + this._hasDiacritics[i] = false; extractTextCapability.resolve(i); } ); diff --git a/web/ui_utils.js b/web/ui_utils.js index 54593573a..86b4bd907 100644 --- a/web/ui_utils.js +++ b/web/ui_utils.js @@ -227,8 +227,8 @@ function removeNullCharacters(str, replaceInvisible = false) { * @returns {number} Index of the first array element to pass the test, * or |items.length| if no such element exists. */ -function binarySearchFirstItem(items, condition) { - let minIndex = 0; +function binarySearchFirstItem(items, condition, start = 0) { + let minIndex = start; let maxIndex = items.length - 1; if (maxIndex < 0 || !condition(items[maxIndex])) { diff --git a/web/viewer.html b/web/viewer.html index 2c22f29ab..e0924307c 100644 --- a/web/viewer.html +++ b/web/viewer.html @@ -135,8 +135,13 @@ See https://github.com/adobe-type-tools/cmap-resources
- + + + +
+ +
diff --git a/web/viewer.js b/web/viewer.js index ef73a0a2a..0f57f7f43 100644 --- a/web/viewer.js +++ b/web/viewer.js @@ -153,6 +153,7 @@ function getViewerConfiguration() { findField: document.getElementById("findInput"), highlightAllCheckbox: document.getElementById("findHighlightAll"), caseSensitiveCheckbox: document.getElementById("findMatchCase"), + matchDiacriticsCheckbox: document.getElementById("findMatchDiacritics"), entireWordCheckbox: document.getElementById("findEntireWord"), findMsg: document.getElementById("findMsg"), findResultsCount: document.getElementById("findResultsCount"),