From 5063a6f2a9281fcf832d0e87abdacae2306b55d0 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Thu, 30 Mar 2023 13:36:42 +0200 Subject: [PATCH] [api-minor] Remove the `disableCombineTextItems` option *Please note:* This parameter has never been used within the PDF.js library/viewer itself, and it was only ever added for backwards compatibility reasons. This parameter was added in PR 7475, over six years ago, to try and optionally maintain the previous *default* text-extraction behaviour. However as part of the general text-extraction improvements in PR 13257, almost two years ago, the `disableCombineTextItems` functionality was accidentally "broken" in various ways. Note how the only (very basic) unit-test was updated in a way that doesn't really make sense, since generally speaking you'd expect that using the option should result in *more* (or at least the same number of) text-items. Furthermore there's also the recent issue 16209, where the option causes almost all textContent to be concatenated together. Hence this patch proposes that we simply remove the `disableCombineTextItems` option since it's essentially unused/untested functionality, as evident from the fact that it took almost two years for someone to notice that it's broken. --- src/core/annotation.js | 1 - src/core/document.js | 9 +-------- src/core/evaluator.js | 8 +------- src/core/worker.js | 5 ++--- src/display/api.js | 8 +------- test/unit/api_spec.js | 21 ++++++--------------- 6 files changed, 11 insertions(+), 41 deletions(-) diff --git a/src/core/annotation.js b/src/core/annotation.js index abd5eddcb..b3f53460d 100644 --- a/src/core/annotation.js +++ b/src/core/annotation.js @@ -1010,7 +1010,6 @@ class Annotation { task, resources, includeMarkedContent: true, - combineTextItems: true, sink, viewBox, }); diff --git a/src/core/document.js b/src/core/document.js index 54f70655d..6e4ca7cff 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -511,13 +511,7 @@ class Page { }); } - extractTextContent({ - handler, - task, - includeMarkedContent, - sink, - combineTextItems, - }) { + extractTextContent({ handler, task, includeMarkedContent, sink }) { const contentStreamPromise = this.getContentStream(); const resourcesPromise = this.loadResources([ "ExtGState", @@ -545,7 +539,6 @@ class Page { task, resources: this.resources, includeMarkedContent, - combineTextItems, sink, viewBox: this.view, }); diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 748914b76..4058885e2 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2236,7 +2236,6 @@ class PartialEvaluator { task, resources, stateManager = null, - combineTextItems = false, includeMarkedContent = false, sink, seenStyles = new Set(), @@ -2534,11 +2533,7 @@ class PartialEvaluator { return false; } - if ( - !combineTextItems || - !textState.font || - !textContentItem.prevTransform - ) { + if (!textState.font || !textContentItem.prevTransform) { return true; } @@ -3191,7 +3186,6 @@ class PartialEvaluator { task, resources: xobj.dict.get("Resources") || resources, stateManager: xObjStateManager, - combineTextItems, includeMarkedContent, sink: sinkWrapper, seenStyles, diff --git a/src/core/worker.js b/src/core/worker.js index 471895c32..46efcca47 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -741,7 +741,7 @@ class WorkerMessageHandler { }); handler.on("GetTextContent", function (data, sink) { - const pageIndex = data.pageIndex; + const { pageIndex, includeMarkedContent } = data; pdfManager.getPage(pageIndex).then(function (page) { const task = new WorkerTask("GetTextContent: page " + pageIndex); @@ -755,8 +755,7 @@ class WorkerMessageHandler { handler, task, sink, - includeMarkedContent: data.includeMarkedContent, - combineTextItems: data.combineTextItems, + includeMarkedContent, }) .then( function () { diff --git a/src/display/api.js b/src/display/api.js index e206eed9b..28b86dac5 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -1120,8 +1120,6 @@ class PDFDocumentProxy { * Page getTextContent parameters. * * @typedef {Object} getTextContentParameters - * @property {boolean} disableCombineTextItems - Do not attempt to combine - * same line {@link TextItem}'s. The default value is `false`. * @property {boolean} [includeMarkedContent] - When true include marked * content items in the items array of TextContent. The default is `false`. */ @@ -1602,17 +1600,13 @@ class PDFPageProxy { * @param {getTextContentParameters} params - getTextContent parameters. * @returns {ReadableStream} Stream for reading text content chunks. */ - streamTextContent({ - disableCombineTextItems = false, - includeMarkedContent = false, - } = {}) { + streamTextContent({ includeMarkedContent = false } = {}) { const TEXT_CONTENT_CHUNK_SIZE = 100; return this._transport.messageHandler.sendWithStream( "GetTextContent", { pageIndex: this._pageIndex, - combineTextItems: disableCombineTextItems !== true, includeMarkedContent: includeMarkedContent === true, }, { diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 0aaa0d47c..ce46d4ef7 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -21,6 +21,7 @@ import { ImageKind, InvalidPDFException, MissingPDFException, + objectSize, OPS, PasswordException, PasswordResponses, @@ -2321,26 +2322,16 @@ describe("api", function () { }); it("gets text content", async function () { - const defaultPromise = page.getTextContent(); - const parametersPromise = page.getTextContent({ - disableCombineTextItems: true, - }); + const { items, styles } = await page.getTextContent(); - const data = await Promise.all([defaultPromise, parametersPromise]); + expect(items.length).toEqual(15); + expect(objectSize(styles)).toEqual(5); - expect(!!data[0].items).toEqual(true); - expect(data[0].items.length).toEqual(15); - expect(!!data[0].styles).toEqual(true); - - const page1 = mergeText(data[0].items); - expect(page1).toEqual(`Table Of Content + const text = mergeText(items); + expect(text).toEqual(`Table Of Content Chapter 1 .......................................................... 2 Paragraph 1.1 ...................................................... 3 page 1 / 3`); - - expect(!!data[1].items).toEqual(true); - expect(data[1].items.length).toEqual(6); - expect(!!data[1].styles).toEqual(true); }); it("gets text content, with correct properties (issue 8276)", async function () {