[api-minor] Don't add out-of-page chars to the text content (bug 1755201)

- It aims to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1755201;
- if the position of a glyph is not within the page view, then the glyph is skipped.
This commit is contained in:
Calixte Denizet 2022-02-13 19:39:40 +01:00
parent 78246719f8
commit 18e3a98c2b
5 changed files with 58 additions and 13 deletions

View File

@ -471,6 +471,7 @@ class Page {
includeMarkedContent, includeMarkedContent,
combineTextItems, combineTextItems,
sink, sink,
viewBox: this.view,
}); });
}); });
} }

View File

@ -2167,6 +2167,7 @@ class PartialEvaluator {
includeMarkedContent = false, includeMarkedContent = false,
sink, sink,
seenStyles = new Set(), seenStyles = new Set(),
viewBox,
}) { }) {
// Ensure that `resources`/`stateManager` is correctly initialized, // Ensure that `resources`/`stateManager` is correctly initialized,
// even if the provided parameter is e.g. `null`. // even if the provided parameter is e.g. `null`.
@ -2393,22 +2394,35 @@ class PartialEvaluator {
} }
function compareWithLastPosition() { function compareWithLastPosition() {
const currentTransform = getCurrentTextTransform();
let posX = currentTransform[4];
let posY = currentTransform[5];
const shiftedX = posX - viewBox[0];
const shiftedY = posY - viewBox[1];
if (
shiftedX < 0 ||
shiftedX > viewBox[2] ||
shiftedY < 0 ||
shiftedY > viewBox[3]
) {
return false;
}
if ( if (
!combineTextItems || !combineTextItems ||
!textState.font || !textState.font ||
!textContentItem.prevTransform !textContentItem.prevTransform
) { ) {
return; return true;
} }
const currentTransform = getCurrentTextTransform();
let posX = currentTransform[4];
let posY = currentTransform[5];
let lastPosX = textContentItem.prevTransform[4]; let lastPosX = textContentItem.prevTransform[4];
let lastPosY = textContentItem.prevTransform[5]; let lastPosY = textContentItem.prevTransform[5];
if (lastPosX === posX && lastPosY === posY) { if (lastPosX === posX && lastPosY === posY) {
return; return true;
} }
let rotate = -1; let rotate = -1;
@ -2473,16 +2487,16 @@ class PartialEvaluator {
0.5 * textContentItem.width /* not the same column */ 0.5 * textContentItem.width /* not the same column */
) { ) {
appendEOL(); appendEOL();
return; return true;
} }
flushTextContentItem(); flushTextContentItem();
return; return true;
} }
if (Math.abs(advanceX) > textContentItem.width) { if (Math.abs(advanceX) > textContentItem.width) {
appendEOL(); appendEOL();
return; return true;
} }
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) { if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.height += advanceY; textContentItem.height += advanceY;
@ -2508,7 +2522,7 @@ class PartialEvaluator {
} }
} }
return; return true;
} }
const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale; const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale;
@ -2523,15 +2537,15 @@ class PartialEvaluator {
0.5 * textContentItem.height /* not the same line */ 0.5 * textContentItem.height /* not the same line */
) { ) {
appendEOL(); appendEOL();
return; return true;
} }
flushTextContentItem(); flushTextContentItem();
return; return true;
} }
if (Math.abs(advanceY) > textContentItem.height) { if (Math.abs(advanceY) > textContentItem.height) {
appendEOL(); appendEOL();
return; return true;
} }
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) { if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
@ -2553,6 +2567,8 @@ class PartialEvaluator {
textContentItem.width += advanceX; textContentItem.width += advanceX;
} }
} }
return true;
} }
function buildTextContentItem({ chars, extraSpacing }) { function buildTextContentItem({ chars, extraSpacing }) {
@ -2617,7 +2633,10 @@ class PartialEvaluator {
continue; continue;
} }
compareWithLastPosition(); if (!compareWithLastPosition()) {
// The glyph is not in page so just skip it.
continue;
}
// Must be called after compareWithLastPosition because // Must be called after compareWithLastPosition because
// the textContentItem could have been flushed. // the textContentItem could have been flushed.
@ -3026,6 +3045,7 @@ class PartialEvaluator {
includeMarkedContent, includeMarkedContent,
sink: sinkWrapper, sink: sinkWrapper,
seenStyles, seenStyles,
viewBox,
}) })
.then(function () { .then(function () {
if (!sinkWrapper.enqueueInvoked) { if (!sinkWrapper.enqueueInvoked) {

View File

@ -0,0 +1 @@
https://bugzilla.mozilla.org/attachment.cgi?id=9263657

View File

@ -1,4 +1,11 @@
[ [
{ "id": "bug1755201",
"file": "pdfs/bug1755201.pdf",
"md5": "cece14097812d8a1f69e86a51e4a3804",
"rounds": 1,
"link": true,
"type": "other"
},
{ "id": "filled-background-range", { "id": "filled-background-range",
"file": "pdfs/filled-background.pdf", "file": "pdfs/filled-background.pdf",
"md5": "2e3120255d9c3e79b96d2543b12d2589", "md5": "2e3120255d9c3e79b96d2543b12d2589",

View File

@ -2219,6 +2219,22 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy(); await loadingTask.destroy();
}); });
// Regression test for bug 1755201: checks that the text content extracted
// from the page does not contain a string which is expected to be out-of-page.
it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () {
// The PDF used here is a linked test-case, which is not supported in
// Node.js, hence the spec is marked as pending in that environment.
if (isNodeJS) {
pending("Linked test-cases are not supported in Node.js.");
}
const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf"));
const pdfDoc = await loadingTask.promise;
const pdfPage = await pdfDoc.getPage(6);
const { items } = await pdfPage.getTextContent();
// Merge all the text items into one string so that the check below does not
// depend on how the text was split into items.
const text = mergeText(items);
// NOTE(review): "win aisle" is presumably text drawn outside of the page
// boundaries in this PDF -- confirm against the attachment of the bug.
expect(/win aisle/.test(text)).toEqual(false);
// Release the resources held by the loading task once the check is done.
await loadingTask.destroy();
});
it("gets empty structure tree", async function () { it("gets empty structure tree", async function () {
const tree = await page.getStructTree(); const tree = await page.getStructTree();