From 18e3a98c2bbfc63e0603b66a49f9e8a2ff58e09c Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Sun, 13 Feb 2022 19:39:40 +0100 Subject: [PATCH] [api-minor] Don't add out-of-page chars to the text content (bug 1755201) - it aims to fix https://bugzilla.mozilla.org/show_bug.cgi?id=1755201; - if the glyph position is not within the page view, then skip that glyph. --- src/core/document.js | 1 + src/core/evaluator.js | 46 +++++++++++++++++++++++++---------- test/pdfs/bug1755201.pdf.link | 1 + test/test_manifest.json | 7 ++++++ test/unit/api_spec.js | 16 ++++++++++++ 5 files changed, 58 insertions(+), 13 deletions(-) create mode 100644 test/pdfs/bug1755201.pdf.link diff --git a/src/core/document.js b/src/core/document.js index 848e05512..1ebff3ab3 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -471,6 +471,7 @@ class Page { includeMarkedContent, combineTextItems, sink, + viewBox: this.view, }); }); } diff --git a/src/core/evaluator.js b/src/core/evaluator.js index b510e4a93..3cfb89ec8 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2167,6 +2167,7 @@ class PartialEvaluator { includeMarkedContent = false, sink, seenStyles = new Set(), + viewBox, }) { // Ensure that `resources`/`stateManager` is correctly initialized, // even if the provided parameter is e.g. `null`. 
@@ -2393,22 +2394,35 @@ class PartialEvaluator { } function compareWithLastPosition() { + const currentTransform = getCurrentTextTransform(); + let posX = currentTransform[4]; + let posY = currentTransform[5]; + + const shiftedX = posX - viewBox[0]; + const shiftedY = posY - viewBox[1]; + + if ( + shiftedX < 0 || + shiftedX > viewBox[2] || + shiftedY < 0 || + shiftedY > viewBox[3] + ) { + return false; + } + if ( !combineTextItems || !textState.font || !textContentItem.prevTransform ) { - return; + return true; } - const currentTransform = getCurrentTextTransform(); - let posX = currentTransform[4]; - let posY = currentTransform[5]; let lastPosX = textContentItem.prevTransform[4]; let lastPosY = textContentItem.prevTransform[5]; if (lastPosX === posX && lastPosY === posY) { - return; + return true; } let rotate = -1; @@ -2473,16 +2487,16 @@ class PartialEvaluator { 0.5 * textContentItem.width /* not the same column */ ) { appendEOL(); - return; + return true; } flushTextContentItem(); - return; + return true; } if (Math.abs(advanceX) > textContentItem.width) { appendEOL(); - return; + return true; } if (advanceY <= textOrientation * textContentItem.trackingSpaceMin) { textContentItem.height += advanceY; @@ -2508,7 +2522,7 @@ class PartialEvaluator { } } - return; + return true; } const advanceX = (posX - lastPosX) / textContentItem.textAdvanceScale; @@ -2523,15 +2537,15 @@ class PartialEvaluator { 0.5 * textContentItem.height /* not the same line */ ) { appendEOL(); - return; + return true; } flushTextContentItem(); - return; + return true; } if (Math.abs(advanceY) > textContentItem.height) { appendEOL(); - return; + return true; } if (advanceX <= textOrientation * textContentItem.trackingSpaceMin) { @@ -2553,6 +2567,8 @@ class PartialEvaluator { textContentItem.width += advanceX; } } + + return true; } function buildTextContentItem({ chars, extraSpacing }) { @@ -2617,7 +2633,10 @@ class PartialEvaluator { continue; } - compareWithLastPosition(); + if 
(!compareWithLastPosition()) { + // The glyph is not in page so just skip it. + continue; + } // Must be called after compareWithLastPosition because // the textContentItem could have been flushed. @@ -3026,6 +3045,7 @@ class PartialEvaluator { includeMarkedContent, sink: sinkWrapper, seenStyles, + viewBox, }) .then(function () { if (!sinkWrapper.enqueueInvoked) { diff --git a/test/pdfs/bug1755201.pdf.link b/test/pdfs/bug1755201.pdf.link new file mode 100644 index 000000000..54b18c659 --- /dev/null +++ b/test/pdfs/bug1755201.pdf.link @@ -0,0 +1 @@ +https://bugzilla.mozilla.org/attachment.cgi?id=9263657 diff --git a/test/test_manifest.json b/test/test_manifest.json index b9302e61d..41f7f7765 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -1,4 +1,11 @@ [ + { "id": "bug1755201", + "file": "pdfs/bug1755201.pdf", + "md5": "cece14097812d8a1f69e86a51e4a3804", + "rounds": 1, + "link": true, + "type": "other" + }, { "id": "filled-background-range", "file": "pdfs/filled-background.pdf", "md5": "2e3120255d9c3e79b96d2543b12d2589", diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index fc56ae9a2..95cc8461c 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -2219,6 +2219,22 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content, and check that out-of-page text is not present (bug 1755201)", async function () { + if (isNodeJS) { + pending("Linked test-cases are not supported in Node.js."); + } + + const loadingTask = getDocument(buildGetDocumentParams("bug1755201.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(6); + const { items } = await pdfPage.getTextContent(); + const text = mergeText(items); + + expect(/win aisle/.test(text)).toEqual(false); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();