From 403baa7bbab555c2a84e6787bc3dc2ab49724467 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Mon, 31 Jan 2022 17:48:35 +0100 Subject: [PATCH] [api-minor] Remove the `normalizeWhitespace` option in the `PDFPageProxy.{getTextContent, streamTextContent}` methods (issue 14519, PR 14428 follow-up) With these changes, we'll now *always* replace all whitespace characters with standard spaces (0x20). This behaviour has already been the default, for many years, in both the viewer and the browser-tests. --- src/core/document.js | 2 -- src/core/evaluator.js | 4 +--- src/core/worker.js | 1 - src/display/api.js | 10 ++++++---- test/driver.js | 1 - test/unit/api_spec.js | 1 - web/pdf_find_controller.js | 4 +--- web/pdf_page_view.js | 1 - 8 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index 19d2e66b4..848e05512 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -438,7 +438,6 @@ class Page { extractTextContent({ handler, task, - normalizeWhitespace, includeMarkedContent, sink, combineTextItems, @@ -469,7 +468,6 @@ class Page { stream: contentStream, task, resources: this.resources, - normalizeWhitespace, includeMarkedContent, combineTextItems, sink, diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 3caf1929d..e78116898 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -2163,7 +2163,6 @@ class PartialEvaluator { task, resources, stateManager = null, - normalizeWhitespace = false, combineTextItems = false, includeMarkedContent = false, sink, @@ -2642,7 +2641,7 @@ class PartialEvaluator { textChunk.prevTransform = getCurrentTextTransform(); } - if (glyph.isWhitespace && normalizeWhitespace) { + if (glyph.isWhitespace) { // Replaces all whitespaces with standard spaces (0x20), to avoid // alignment issues between the textLayer and the canvas if the text // contains e.g. tabs (fixes issue6612.pdf). 
@@ -3023,7 +3022,6 @@ class PartialEvaluator { task, resources: xobj.dict.get("Resources") || resources, stateManager: xObjStateManager, - normalizeWhitespace, combineTextItems, includeMarkedContent, sink: sinkWrapper, diff --git a/src/core/worker.js b/src/core/worker.js index 328ffb7f2..08b4eae20 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -740,7 +740,6 @@ class WorkerMessageHandler { handler, task, sink, - normalizeWhitespace: data.normalizeWhitespace, includeMarkedContent: data.includeMarkedContent, combineTextItems: data.combineTextItems, }) diff --git a/src/display/api.js b/src/display/api.js index 67b3dd178..415ca9e2d 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -1069,8 +1069,6 @@ class PDFDocumentProxy { * Page getTextContent parameters. * * @typedef {Object} getTextContentParameters - * @property {boolean} normalizeWhitespace - Replaces all occurrences of - * whitespace with standard spaces (0x20). The default value is `false`. * @property {boolean} disableCombineTextItems - Do not attempt to combine * same line {@link TextItem}'s. The default value is `false`. * @property {boolean} [includeMarkedContent] - When true include marked @@ -1585,11 +1583,13 @@ class PDFPageProxy { } /** + * NOTE: All occurrences of whitespace will be replaced by + * standard spaces (0x20). + * * @param {getTextContentParameters} params - getTextContent parameters. * @returns {ReadableStream} Stream for reading text content chunks. */ streamTextContent({ - normalizeWhitespace = false, disableCombineTextItems = false, includeMarkedContent = false, } = {}) { @@ -1599,7 +1599,6 @@ class PDFPageProxy { "GetTextContent", { pageIndex: this._pageIndex, - normalizeWhitespace: normalizeWhitespace === true, combineTextItems: disableCombineTextItems !== true, includeMarkedContent: includeMarkedContent === true, }, @@ -1613,6 +1612,9 @@ class PDFPageProxy { } /** + * NOTE: All occurrences of whitespace will be replaced by + * standard spaces (0x20). 
+ * * @param {getTextContentParameters} params - getTextContent parameters. * @returns {Promise} A promise that is resolved with a * {@link TextContent} object that represents the page's text content. diff --git a/test/driver.js b/test/driver.js index 52d933889..e92bf829d 100644 --- a/test/driver.js +++ b/test/driver.js @@ -644,7 +644,6 @@ class Driver { // The text builder will draw its content on the test canvas initPromise = page .getTextContent({ - normalizeWhitespace: true, includeMarkedContent: true, }) .then(function (textContent) { diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 293c4563c..24ca5fea3 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -1966,7 +1966,6 @@ describe("api", function () { it("gets text content", async function () { const defaultPromise = page.getTextContent(); const parametersPromise = page.getTextContent({ - normalizeWhitespace: true, disableCombineTextItems: true, }); diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js index 1f0008b35..49677e15f 100644 --- a/web/pdf_find_controller.js +++ b/web/pdf_find_controller.js @@ -551,9 +551,7 @@ class PDFFindController { return this._pdfDocument .getPage(i + 1) .then(pdfPage => { - return pdfPage.getTextContent({ - normalizeWhitespace: true, - }); + return pdfPage.getTextContent(); }) .then( textContent => { diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js index 40e454fc8..f7e7dad3d 100644 --- a/web/pdf_page_view.js +++ b/web/pdf_page_view.js @@ -701,7 +701,6 @@ class PDFPageView { return finishPaintTask(null).then(() => { if (textLayer) { const readableStream = pdfPage.streamTextContent({ - normalizeWhitespace: true, includeMarkedContent: true, }); textLayer.setTextContentStream(readableStream);