[api-minor] Don't include out-of-page chars in the text content (bug 1755201)

- It aims to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1755201;
- If the glyph position is not within the page view, then skip that glyph.
This commit is contained in:
Calixte Denizet 2022-02-13 19:39:40 +01:00
parent 78246719f8
commit 18e3a98c2b
5 changed files with 58 additions and 13 deletions

View File

@ -471,6 +471,7 @@ class Page {
includeMarkedContent,
combineTextItems,
sink,
viewBox: this.view,
});
});
}

View File

@ -2167,6 +2167,7 @@ class PartialEvaluator {
includeMarkedContent = false,
sink,
seenStyles = new Set(),
viewBox,
}) {
// Ensure that `resources`/`stateManager` is correctly initialized,
// even if the provided parameter is e.g. `null`.
@ -2393,22 +2394,35 @@ class PartialEvaluator {
}
function compareWithLastPosition() {
const currentTransform = getCurrentTextTransform();
let posX = currentTransform[4];
let posY = currentTransform[5];
const shiftedX = posX - viewBox[0];
const shiftedY = posY - viewBox[1];
if (
shiftedX < 0 ||
shiftedX > viewBox[2] ||
shiftedY < 0 ||
shiftedY > viewBox[3]
) {
return false;
}
if (
!combineTextItems ||
!textState.font ||
!textContentItem.prevTransform
) {
return;
return true;
}
const currentTransform = getCurrentTextTransform();
let posX = currentTransform[4];
let posY = currentTransform[5];
let lastPosX = textContentItem.prevTransform[4];
let lastPosY = textContentItem.prevTransform[5];
if (lastPosX === posX && lastPosY === posY) {
return;
return true;
}
let rotate = -1;
@ -2473,16 +2487,16 @@ class PartialEvaluator {
0.5 * textContentItem.width /* not the same column */
) {
appendEOL();
return;
return true;
}
flushTextContentItem();
return;
return true;
}
if (Math.abs(advanceX) > textContentItem.width) {
appendEOL();
return;
return true;
}
if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) {
textContentItem.height += advanceY;
@ -2508,7 +2522,7 @@ class PartialEvaluator {
}
}
return;
return true;
}
const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale;
@ -2523,15 +2537,15 @@ class PartialEvaluator {
0.5 * textContentItem.height /* not the same line */
) {
appendEOL();
return;
return true;
}
flushTextContentItem();
return;
return true;
}
if (Math.abs(advanceY) > textContentItem.height) {
appendEOL();
return;
return true;
}
if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) {
@ -2553,6 +2567,8 @@ class PartialEvaluator {
textContentItem.width += advanceX;
}
}
return true;
}
function buildTextContentItem({ chars, extraSpacing }) {
@ -2617,7 +2633,10 @@ class PartialEvaluator {
continue;
}
compareWithLastPosition();
if (!compareWithLastPosition()) {
// The glyph is not in page so just skip it.
continue;
}
// Must be called after compareWithLastPosition because
// the textContentItem could have been flushed.
@ -3026,6 +3045,7 @@ class PartialEvaluator {
includeMarkedContent,
sink: sinkWrapper,
seenStyles,
viewBox,
})
.then(function () {
if (!sinkWrapper.enqueueInvoked) {

View File

@ -0,0 +1 @@
https://bugzilla.mozilla.org/attachment.cgi?id=9263657

View File

@ -1,4 +1,11 @@
[
{ "id": "bug1755201",
"file": "pdfs/bug1755201.pdf",
"md5": "cece14097812d8a1f69e86a51e4a3804",
"rounds": 1,
"link": true,
"type": "other"
},
{ "id": "filled-background-range",
"file": "pdfs/filled-background.pdf",
"md5": "2e3120255d9c3e79b96d2543b12d2589",

View File

@ -2219,6 +2219,22 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await loadingTask.destroy();
});
// Regression test for bug 1755201: loads page 6 of `bug1755201.pdf`, extracts
// its text content, and checks that the merged text does not contain
// "win aisle" (presumably the text placed outside of the page in that
// document -- confirm against the PDF).
it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () {
  // Linked test-cases are skipped when running under Node.js.
  if (isNodeJS) {
    pending("Linked test-cases are not supported in Node.js.");
  }

  const docParams = buildGetDocumentParams("bug1755201.pdf");
  const task = getDocument(docParams);
  const doc = await task.promise;
  const sixthPage = await doc.getPage(6);
  const textContent = await sixthPage.getTextContent();
  const mergedText = mergeText(textContent.items);
  const hasOutOfPageText = /win aisle/.test(mergedText);
  expect(hasOutOfPageText).toEqual(false);

  // Release the document resources once the assertion has run.
  await task.destroy();
});
it("gets empty structure tree", async function () {
const tree = await page.getStructTree();