diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 912538af8..748914b76 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2341,6 +2341,12 @@ class PartialEvaluator { const SPACE_IN_FLOW_MIN_FACTOR = 0.102; const SPACE_IN_FLOW_MAX_FACTOR = 0.6; + // If a char is too high/too low compared to the previous one, we start a + // new chunk: the current text item is flushed when |advanceY| exceeds + // VERTICAL_SHIFT_RATIO * height (or, in the other code path, when + // |advanceX| exceeds VERTICAL_SHIFT_RATIO * width). + const VERTICAL_SHIFT_RATIO = 0.25; + const self = this; const xref = this.xref; const showSpacedTextBuffer = []; @@ -2649,6 +2655,10 @@ class PartialEvaluator { } } + if (Math.abs(advanceX) > textContentItem.width * VERTICAL_SHIFT_RATIO) { + flushTextContentItem(); + } + return true; } @@ -2706,6 +2716,10 @@ class PartialEvaluator { } } + if (Math.abs(advanceY) > textContentItem.height * VERTICAL_SHIFT_RATIO) { + flushTextContentItem(); + } + return true; } diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 679f1e97b..7756d4512 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -581,3 +581,4 @@ !issue16063.pdf !issue16067.pdf !bug1820909.1.pdf +!issue16221.pdf diff --git a/test/pdfs/issue16221.pdf b/test/pdfs/issue16221.pdf new file mode 100755 index 000000000..1334ac774 Binary files /dev/null and b/test/pdfs/issue16221.pdf differ diff --git a/test/test_manifest.json b/test/test_manifest.json index e88f2975f..ba6324c3c 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -7510,5 +7510,12 @@ "md5": "f71e89ebe3d6e75e0c83ce41cd72df1f", "link": true, "type": "other" + }, + { + "id": "issue16221-text", + "file": "pdfs/issue16221.pdf", + "md5": "62d93c9b3aa4ba3af5446504632e78a5", + "rounds": 1, + "type": "text" } ] diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 4c157c3b5..0aaa0d47c 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -2624,6 +2624,17 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await 
loadingTask.destroy(); }); + it("gets text content with a rised text", async function () { + const loadingTask = getDocument(buildGetDocumentParams("issue16221.pdf")); + const pdfDoc = await loadingTask.promise; + const pdfPage = await pdfDoc.getPage(1); + const { items } = await pdfPage.getTextContent(); + + expect(items.map(i => i.str)).toEqual(["Hello ", "World"]); + + await loadingTask.destroy(); + }); + it("gets empty structure tree", async function () { const tree = await page.getStructTree();