From 3789dab307eae0a586c8da2f8c5a34b761a196a8 Mon Sep 17 00:00:00 2001 From: Calixte Denizet Date: Sat, 25 Jun 2022 16:40:46 +0200 Subject: [PATCH] Always flush the current item with MarkedContent stuff when getting text (#15094) --- src/core/evaluator.js | 5 +++-- test/unit/api_spec.js | 26 +++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 4250d9f6f..cda5f7df7 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -3290,6 +3290,7 @@ class PartialEvaluator { ); return; case OPS.beginMarkedContent: + flushTextContentItem(); if (includeMarkedContent) { textContent.items.push({ type: "beginMarkedContent", @@ -3298,8 +3299,8 @@ class PartialEvaluator { } break; case OPS.beginMarkedContentProps: + flushTextContentItem(); if (includeMarkedContent) { - flushTextContentItem(); let mcid = null; if (args[1] instanceof Dict) { mcid = args[1].get("MCID"); @@ -3314,8 +3315,8 @@ class PartialEvaluator { } break; case OPS.endMarkedContent: + flushTextContentItem(); if (includeMarkedContent) { - flushTextContentItem(); textContent.items.push({ type: "endMarkedContent", }); diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 40ffdfcd8..e9eb7c59e 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -78,7 +78,9 @@ describe("api", function () { } function mergeText(items) { - return items.map(chunk => chunk.str + (chunk.hasEOL ? "\n" : "")).join(""); + return items + .map(chunk => (chunk.str ?? "") + (chunk.hasEOL ? "\n" : "")) + .join(""); } describe("getDocument", function () { @@ -2275,6 +2277,28 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await loadingTask.destroy(); }); + it("gets text content with or without includeMarkedContent, and compare (issue 15094)", async function () { + if (isNodeJS) { + pending("Linked test-cases are not supported in Node.js."); + } + + const loadingTask = getDocument(buildGetDocumentParams("pdf.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(568); + let { items } = await pdfPage.getTextContent({ + includeMarkedContent: false, + }); + const textWithoutMC = mergeText(items); + ({ items } = await pdfPage.getTextContent({ + includeMarkedContent: true, + })); + const textWithMC = mergeText(items); + + expect(textWithoutMC).toEqual(textWithMC); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();