[api-minor] Highlight search results correctly for normalized text (PR 9448)

This patch is a rebased *and* refactored version of PR 9448, such that it applies cleanly given that `PDFFindController` has changed since that PR was opened; obviously keeping the original author information intact.

This patch will thus ensure that e.g. fractions, and other things that we normalize before searching, will still be highlighted correctly in the textLayer.

Furthermore, this patch also adds basic unit-tests for this functionality.

*Note:* The `[api-minor]` tag is added, since third-party implementations of the `PDFFindController` must now always use the `pageMatchesLength` property to get accurate length information (see the `web/text_layer_builder.js` changes).

Co-authored-by: Ross Johnson <ross@mazira.com>
Co-authored-by: Jonas Jenwald <jonas.jenwald@gmail.com>
This commit is contained in:
Ross Johnson 2021-01-12 15:21:19 +01:00 committed by Jonas Jenwald
parent 1de1ae0be6
commit 6dae2677d5
6 changed files with 220 additions and 106 deletions

View File

@ -6,6 +6,7 @@
!TrueType_without_cmap.pdf !TrueType_without_cmap.pdf
!franz.pdf !franz.pdf
!franz_2.pdf !franz_2.pdf
!fraction-highlight.pdf
!german-umlaut-r.pdf !german-umlaut-r.pdf
!xref_command_missing.pdf !xref_command_missing.pdf
!issue1155r.pdf !issue1155r.pdf

Binary file not shown.

View File

@ -18,6 +18,9 @@ import { isNodeJS } from "../../src/shared/is_node.js";
import { PDFNodeStream } from "../../src/display/node_stream.js"; import { PDFNodeStream } from "../../src/display/node_stream.js";
import { setPDFNetworkStreamFactory } from "../../src/display/api.js"; import { setPDFNetworkStreamFactory } from "../../src/display/api.js";
// Sets longer timeout, similar to `jasmine-boot.js`.
jasmine.DEFAULT_TIMEOUT_INTERVAL = 30000;
// Ensure that this script only runs in Node.js environments. // Ensure that this script only runs in Node.js environments.
if (!isNodeJS) { if (!isNodeJS) {
throw new Error( throw new Error(

View File

@ -19,6 +19,8 @@ import { getDocument } from "../../src/display/api.js";
import { PDFFindController } from "../../web/pdf_find_controller.js"; import { PDFFindController } from "../../web/pdf_find_controller.js";
import { SimpleLinkService } from "../../web/pdf_link_service.js"; import { SimpleLinkService } from "../../web/pdf_link_service.js";
const tracemonkeyFileName = "tracemonkey.pdf";
class MockLinkService extends SimpleLinkService { class MockLinkService extends SimpleLinkService {
constructor() { constructor() {
super(); super();
@ -44,89 +46,102 @@ class MockLinkService extends SimpleLinkService {
} }
} }
describe("pdf_find_controller", function () { async function initPdfFindController(filename) {
let eventBus; const loadingTask = getDocument(
let pdfFindController; buildGetDocumentParams(filename || tracemonkeyFileName)
);
const pdfDocument = await loadingTask.promise;
beforeEach(function (done) { const eventBus = new EventBus();
const loadingTask = getDocument(buildGetDocumentParams("tracemonkey.pdf"));
loadingTask.promise.then(function (pdfDocument) {
eventBus = new EventBus();
const linkService = new MockLinkService(); const linkService = new MockLinkService();
linkService.setDocument(pdfDocument); linkService.setDocument(pdfDocument);
pdfFindController = new PDFFindController({ const pdfFindController = new PDFFindController({
linkService, linkService,
eventBus, eventBus,
});
pdfFindController.setDocument(pdfDocument); // Enable searching.
done();
});
}); });
pdfFindController.setDocument(pdfDocument); // Enable searching.
afterEach(function () { return { eventBus, pdfFindController };
eventBus = null; }
pdfFindController = null;
});
function testSearch({ parameters, matchesPerPage, selectedMatch }) { function testSearch({
return new Promise(function (resolve) { eventBus,
pdfFindController.executeCommand("find", parameters); pdfFindController,
parameters,
matchesPerPage,
selectedMatch,
pageMatches = null,
pageMatchesLength = null,
}) {
return new Promise(function (resolve) {
pdfFindController.executeCommand("find", parameters);
// The `updatefindmatchescount` event is only emitted if the page contains // The `updatefindmatchescount` event is only emitted if the page contains
// at least one match for the query, so the last non-zero item in the // at least one match for the query, so the last non-zero item in the
// matches per page array corresponds to the page for which the final // matches per page array corresponds to the page for which the final
// `updatefindmatchescount` event is emitted. If this happens, we know // `updatefindmatchescount` event is emitted. If this happens, we know
// that any subsequent pages won't trigger the event anymore and we // that any subsequent pages won't trigger the event anymore and we
// can start comparing the matches per page. This logic is necessary // can start comparing the matches per page. This logic is necessary
// because we call the `pdfFindController.pageMatches` getter directly // because we call the `pdfFindController.pageMatches` getter directly
// after receiving the event and the underlying `_pageMatches` array // after receiving the event and the underlying `_pageMatches` array
// is only extended when a page is processed, so it will only contain // is only extended when a page is processed, so it will only contain
// entries for the pages processed until the time when the final event // entries for the pages processed until the time when the final event
// was emitted. // was emitted.
let totalPages = matchesPerPage.length; let totalPages = matchesPerPage.length;
for (let i = totalPages - 1; i >= 0; i--) { for (let i = totalPages - 1; i >= 0; i--) {
if (matchesPerPage[i] > 0) { if (matchesPerPage[i] > 0) {
totalPages = i + 1; totalPages = i + 1;
break; break;
}
} }
}
const totalMatches = matchesPerPage.reduce((a, b) => { const totalMatches = matchesPerPage.reduce((a, b) => {
return a + b; return a + b;
});
eventBus.on(
"updatefindmatchescount",
function onUpdateFindMatchesCount(evt) {
if (pdfFindController.pageMatches.length !== totalPages) {
return;
}
eventBus.off("updatefindmatchescount", onUpdateFindMatchesCount);
expect(evt.matchesCount.total).toBe(totalMatches);
for (let i = 0; i < totalPages; i++) {
expect(pdfFindController.pageMatches[i].length).toEqual(
matchesPerPage[i]
);
}
expect(pdfFindController.selected.pageIdx).toEqual(
selectedMatch.pageIndex
);
expect(pdfFindController.selected.matchIdx).toEqual(
selectedMatch.matchIndex
);
resolve();
}
);
}); });
}
it("performs a normal search", function (done) { eventBus.on(
testSearch({ "updatefindmatchescount",
function onUpdateFindMatchesCount(evt) {
if (pdfFindController.pageMatches.length !== totalPages) {
return;
}
eventBus.off("updatefindmatchescount", onUpdateFindMatchesCount);
expect(evt.matchesCount.total).toBe(totalMatches);
for (let i = 0; i < totalPages; i++) {
expect(pdfFindController.pageMatches[i].length).toEqual(
matchesPerPage[i]
);
}
expect(pdfFindController.selected.pageIdx).toEqual(
selectedMatch.pageIndex
);
expect(pdfFindController.selected.matchIdx).toEqual(
selectedMatch.matchIndex
);
if (pageMatches) {
expect(pdfFindController.pageMatches).toEqual(pageMatches);
expect(pdfFindController.pageMatchesLength).toEqual(
pageMatchesLength
);
}
resolve();
}
);
});
}
describe("pdf_find_controller", function () {
it("performs a normal search", async function () {
const { eventBus, pdfFindController } = await initPdfFindController();
await testSearch({
eventBus,
pdfFindController,
parameters: { parameters: {
query: "Dynamic", query: "Dynamic",
caseSensitive: false, caseSensitive: false,
@ -139,14 +154,18 @@ describe("pdf_find_controller", function () {
pageIndex: 0, pageIndex: 0,
matchIndex: 0, matchIndex: 0,
}, },
}).then(done); });
}); });
it("performs a normal search and finds the previous result", function (done) { it("performs a normal search and finds the previous result", async function () {
// Page 14 (with page index 13) contains five results. By default, the // Page 14 (with page index 13) contains five results. By default, the
// first result (match index 0) is selected, so the previous result // first result (match index 0) is selected, so the previous result
// should be the fifth result (match index 4). // should be the fifth result (match index 4).
testSearch({ const { eventBus, pdfFindController } = await initPdfFindController();
await testSearch({
eventBus,
pdfFindController,
parameters: { parameters: {
query: "conference", query: "conference",
caseSensitive: false, caseSensitive: false,
@ -159,11 +178,15 @@ describe("pdf_find_controller", function () {
pageIndex: 13, pageIndex: 13,
matchIndex: 4, matchIndex: 4,
}, },
}).then(done); });
}); });
it("performs a case sensitive search", function (done) { it("performs a case sensitive search", async function () {
testSearch({ const { eventBus, pdfFindController } = await initPdfFindController();
await testSearch({
eventBus,
pdfFindController,
parameters: { parameters: {
query: "Dynamic", query: "Dynamic",
caseSensitive: true, caseSensitive: true,
@ -176,13 +199,17 @@ describe("pdf_find_controller", function () {
pageIndex: 0, pageIndex: 0,
matchIndex: 0, matchIndex: 0,
}, },
}).then(done); });
}); });
it("performs an entire word search", function (done) { it("performs an entire word search", async function () {
// Page 13 contains both 'Government' and 'Governmental', so the latter // Page 13 contains both 'Government' and 'Governmental', so the latter
// should not be found with entire word search. // should not be found with entire word search.
testSearch({ const { eventBus, pdfFindController } = await initPdfFindController();
await testSearch({
eventBus,
pdfFindController,
parameters: { parameters: {
query: "Government", query: "Government",
caseSensitive: false, caseSensitive: false,
@ -195,13 +222,17 @@ describe("pdf_find_controller", function () {
pageIndex: 12, pageIndex: 12,
matchIndex: 0, matchIndex: 0,
}, },
}).then(done); });
}); });
it("performs a multiple term (no phrase) search", function (done) { it("performs a multiple term (no phrase) search", async function () {
// Page 9 contains 'alternate' and pages 6 and 9 contain 'solution'. // Page 9 contains 'alternate' and pages 6 and 9 contain 'solution'.
// Both should be found for multiple term (no phrase) search. // Both should be found for multiple term (no phrase) search.
testSearch({ const { eventBus, pdfFindController } = await initPdfFindController();
await testSearch({
eventBus,
pdfFindController,
parameters: { parameters: {
query: "alternate solution", query: "alternate solution",
caseSensitive: false, caseSensitive: false,
@ -214,6 +245,31 @@ describe("pdf_find_controller", function () {
pageIndex: 5, pageIndex: 5,
matchIndex: 0, matchIndex: 0,
}, },
}).then(done); });
});
it("performs a normal search, where the text is normalized", async function () {
const { eventBus, pdfFindController } = await initPdfFindController(
"fraction-highlight.pdf"
);
await testSearch({
eventBus,
pdfFindController,
parameters: {
query: "fraction",
caseSensitive: false,
entireWord: false,
phraseSearch: true,
findPrevious: false,
},
matchesPerPage: [3],
selectedMatch: {
pageIndex: 0,
matchIndex: 0,
},
pageMatches: [[19, 48, 66]],
pageMatchesLength: [[8, 8, 8]],
});
}); });
}); });

View File

@ -49,9 +49,40 @@ function normalize(text) {
const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join(""); const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join("");
normalizationRegex = new RegExp(`[${replace}]`, "g"); normalizationRegex = new RegExp(`[${replace}]`, "g");
} }
return text.replace(normalizationRegex, function (ch) { let diffs = null;
return CHARACTERS_TO_NORMALIZE[ch]; const normalizedText = text.replace(normalizationRegex, function (ch, index) {
const normalizedCh = CHARACTERS_TO_NORMALIZE[ch],
diff = normalizedCh.length - ch.length;
if (diff !== 0) {
(diffs ||= []).push([index, diff]);
}
return normalizedCh;
}); });
return [normalizedText, diffs];
}
// Determine the original, non-normalized, match index such that highlighting of
// search results is correct in the `textLayer` for strings containing e.g. "½"
// characters; essentially "inverting" the result of the `normalize` function.
function getOriginalIndex(matchIndex, diffs = null) {
if (!diffs) {
return matchIndex;
}
let totalDiff = 0;
for (const [index, diff] of diffs) {
const currentIndex = index + totalDiff;
if (currentIndex >= matchIndex) {
break;
}
if (currentIndex + diff > matchIndex) {
totalDiff += matchIndex - currentIndex;
break;
}
totalDiff += diff;
}
return matchIndex - totalDiff;
} }
/** /**
@ -215,6 +246,7 @@ class PDFFindController {
}; };
this._extractTextPromises = []; this._extractTextPromises = [];
this._pageContents = []; // Stores the normalized text for each page. this._pageContents = []; // Stores the normalized text for each page.
this._pageDiffs = [];
this._matchesCountTotal = 0; this._matchesCountTotal = 0;
this._pagesToSearch = null; this._pagesToSearch = null;
this._pendingFindMatches = Object.create(null); this._pendingFindMatches = Object.create(null);
@ -232,7 +264,7 @@ class PDFFindController {
get _query() { get _query() {
if (this._state.query !== this._rawQuery) { if (this._state.query !== this._rawQuery) {
this._rawQuery = this._state.query; this._rawQuery = this._state.query;
this._normalizedQuery = normalize(this._state.query); [this._normalizedQuery] = normalize(this._state.query);
} }
return this._normalizedQuery; return this._normalizedQuery;
} }
@ -349,8 +381,9 @@ class PDFFindController {
return true; return true;
} }
_calculatePhraseMatch(query, pageIndex, pageContent, entireWord) { _calculatePhraseMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
const matches = []; const matches = [],
matchesLength = [];
const queryLen = query.length; const queryLen = query.length;
let matchIdx = -queryLen; let matchIdx = -queryLen;
@ -362,12 +395,19 @@ class PDFFindController {
if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) { if (entireWord && !this._isEntireWord(pageContent, matchIdx, queryLen)) {
continue; continue;
} }
matches.push(matchIdx); const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
matchEnd = matchIdx + queryLen - 1,
originalQueryLen =
getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
matches.push(originalMatchIdx);
matchesLength.push(originalQueryLen);
} }
this._pageMatches[pageIndex] = matches; this._pageMatches[pageIndex] = matches;
this._pageMatchesLength[pageIndex] = matchesLength;
} }
_calculateWordMatch(query, pageIndex, pageContent, entireWord) { _calculateWordMatch(query, pageIndex, pageContent, pageDiffs, entireWord) {
const matchesWithLength = []; const matchesWithLength = [];
// Divide the query into pieces and search for text in each piece. // Divide the query into pieces and search for text in each piece.
@ -388,10 +428,15 @@ class PDFFindController {
) { ) {
continue; continue;
} }
const originalMatchIdx = getOriginalIndex(matchIdx, pageDiffs),
matchEnd = matchIdx + subqueryLen - 1,
originalQueryLen =
getOriginalIndex(matchEnd, pageDiffs) - originalMatchIdx + 1;
// Other searches do not, so we store the length. // Other searches do not, so we store the length.
matchesWithLength.push({ matchesWithLength.push({
match: matchIdx, match: originalMatchIdx,
matchLength: subqueryLen, matchLength: originalQueryLen,
skipped: false, skipped: false,
}); });
} }
@ -412,6 +457,7 @@ class PDFFindController {
_calculateMatch(pageIndex) { _calculateMatch(pageIndex) {
let pageContent = this._pageContents[pageIndex]; let pageContent = this._pageContents[pageIndex];
const pageDiffs = this._pageDiffs[pageIndex];
let query = this._query; let query = this._query;
const { caseSensitive, entireWord, phraseSearch } = this._state; const { caseSensitive, entireWord, phraseSearch } = this._state;
@ -426,9 +472,21 @@ class PDFFindController {
} }
if (phraseSearch) { if (phraseSearch) {
this._calculatePhraseMatch(query, pageIndex, pageContent, entireWord); this._calculatePhraseMatch(
query,
pageIndex,
pageContent,
pageDiffs,
entireWord
);
} else { } else {
this._calculateWordMatch(query, pageIndex, pageContent, entireWord); this._calculateWordMatch(
query,
pageIndex,
pageContent,
pageDiffs,
entireWord
);
} }
// When `highlightAll` is set, ensure that the matches on previously // When `highlightAll` is set, ensure that the matches on previously
@ -478,7 +536,9 @@ class PDFFindController {
} }
// Store the normalized page content (text items) as one string. // Store the normalized page content (text items) as one string.
this._pageContents[i] = normalize(strBuf.join("")); [this._pageContents[i], this._pageDiffs[i]] = normalize(
strBuf.join("")
);
extractTextCapability.resolve(i); extractTextCapability.resolve(i);
}, },
reason => { reason => {
@ -488,6 +548,7 @@ class PDFFindController {
); );
// Page error -- assuming no text content. // Page error -- assuming no text content.
this._pageContents[i] = ""; this._pageContents[i] = "";
this._pageDiffs[i] = null;
extractTextCapability.resolve(i); extractTextCapability.resolve(i);
} }
); );

View File

@ -161,12 +161,11 @@ class TextLayerBuilder {
if (!matches) { if (!matches) {
return []; return [];
} }
const { findController, textContentItemsStr } = this; const { textContentItemsStr } = this;
let i = 0, let i = 0,
iIndex = 0; iIndex = 0;
const end = textContentItemsStr.length - 1; const end = textContentItemsStr.length - 1;
const queryLen = findController.state.query.length;
const result = []; const result = [];
for (let m = 0, mm = matches.length; m < mm; m++) { for (let m = 0, mm = matches.length; m < mm; m++) {
@ -191,13 +190,7 @@ class TextLayerBuilder {
}; };
// Calculate the end position. // Calculate the end position.
if (matchesLength) { matchIdx += matchesLength[m];
// Multiterm search.
matchIdx += matchesLength[m];
} else {
// Phrase search.
matchIdx += queryLen;
}
// Somewhat the same array as above, but use > instead of >= to get // Somewhat the same array as above, but use > instead of >= to get
// the end position right. // the end position right.