From 403baa7bbab555c2a84e6787bc3dc2ab49724467 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Mon, 31 Jan 2022 17:48:35 +0100
Subject: [PATCH] [api-minor] Remove the `normalizeWhitespace` option in the
 `PDFPageProxy.{getTextContent, streamTextContent}` methods (issue 14519, PR
 14428 follow-up)

With these changes, we'll now *always* replace all whitespace characters with standard spaces (0x20). This behaviour has already been the default, for many years, in both the viewer and the browser-tests.
---
 src/core/document.js       |  2 --
 src/core/evaluator.js      |  4 +---
 src/core/worker.js         |  1 -
 src/display/api.js         | 10 ++++++----
 test/driver.js             |  1 -
 test/unit/api_spec.js      |  1 -
 web/pdf_find_controller.js |  4 +---
 web/pdf_page_view.js       |  1 -
 8 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/src/core/document.js b/src/core/document.js
index 19d2e66b4..848e05512 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -438,7 +438,6 @@ class Page {
   extractTextContent({
     handler,
     task,
-    normalizeWhitespace,
     includeMarkedContent,
     sink,
     combineTextItems,
@@ -469,7 +468,6 @@ class Page {
         stream: contentStream,
         task,
         resources: this.resources,
-        normalizeWhitespace,
         includeMarkedContent,
         combineTextItems,
         sink,
diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 3caf1929d..e78116898 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -2163,7 +2163,6 @@ class PartialEvaluator {
     task,
     resources,
     stateManager = null,
-    normalizeWhitespace = false,
     combineTextItems = false,
     includeMarkedContent = false,
     sink,
@@ -2642,7 +2641,7 @@ class PartialEvaluator {
           textChunk.prevTransform = getCurrentTextTransform();
         }
 
-        if (glyph.isWhitespace && normalizeWhitespace) {
+        if (glyph.isWhitespace) {
           // Replaces all whitespaces with standard spaces (0x20), to avoid
           // alignment issues between the textLayer and the canvas if the text
           // contains e.g. tabs (fixes issue6612.pdf).
@@ -3023,7 +3022,6 @@ class PartialEvaluator {
                     task,
                     resources: xobj.dict.get("Resources") || resources,
                     stateManager: xObjStateManager,
-                    normalizeWhitespace,
                     combineTextItems,
                     includeMarkedContent,
                     sink: sinkWrapper,
diff --git a/src/core/worker.js b/src/core/worker.js
index 328ffb7f2..08b4eae20 100644
--- a/src/core/worker.js
+++ b/src/core/worker.js
@@ -740,7 +740,6 @@ class WorkerMessageHandler {
             handler,
             task,
             sink,
-            normalizeWhitespace: data.normalizeWhitespace,
             includeMarkedContent: data.includeMarkedContent,
             combineTextItems: data.combineTextItems,
           })
diff --git a/src/display/api.js b/src/display/api.js
index 67b3dd178..415ca9e2d 100644
--- a/src/display/api.js
+++ b/src/display/api.js
@@ -1069,8 +1069,6 @@ class PDFDocumentProxy {
  * Page getTextContent parameters.
  *
  * @typedef {Object} getTextContentParameters
- * @property {boolean} normalizeWhitespace - Replaces all occurrences of
- *   whitespace with standard spaces (0x20). The default value is `false`.
  * @property {boolean} disableCombineTextItems - Do not attempt to combine
  *   same line {@link TextItem}'s. The default value is `false`.
  * @property {boolean} [includeMarkedContent] - When true include marked
@@ -1585,11 +1583,13 @@ class PDFPageProxy {
   }
 
   /**
+   * NOTE: All occurrences of whitespace will be replaced by
+   * standard spaces (0x20).
+   *
    * @param {getTextContentParameters} params - getTextContent parameters.
    * @returns {ReadableStream} Stream for reading text content chunks.
    */
   streamTextContent({
-    normalizeWhitespace = false,
     disableCombineTextItems = false,
     includeMarkedContent = false,
   } = {}) {
@@ -1599,7 +1599,6 @@ class PDFPageProxy {
       "GetTextContent",
       {
         pageIndex: this._pageIndex,
-        normalizeWhitespace: normalizeWhitespace === true,
         combineTextItems: disableCombineTextItems !== true,
         includeMarkedContent: includeMarkedContent === true,
       },
@@ -1613,6 +1612,9 @@ class PDFPageProxy {
   }
 
   /**
+   * NOTE: All occurrences of whitespace will be replaced by
+   * standard spaces (0x20).
+   *
    * @param {getTextContentParameters} params - getTextContent parameters.
    * @returns {Promise<TextContent>} A promise that is resolved with a
    *   {@link TextContent} object that represents the page's text content.
diff --git a/test/driver.js b/test/driver.js
index 52d933889..e92bf829d 100644
--- a/test/driver.js
+++ b/test/driver.js
@@ -644,7 +644,6 @@ class Driver {
               // The text builder will draw its content on the test canvas
               initPromise = page
                 .getTextContent({
-                  normalizeWhitespace: true,
                   includeMarkedContent: true,
                 })
                 .then(function (textContent) {
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 293c4563c..24ca5fea3 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -1966,7 +1966,6 @@ describe("api", function () {
     it("gets text content", async function () {
       const defaultPromise = page.getTextContent();
       const parametersPromise = page.getTextContent({
-        normalizeWhitespace: true,
         disableCombineTextItems: true,
       });
 
diff --git a/web/pdf_find_controller.js b/web/pdf_find_controller.js
index 1f0008b35..49677e15f 100644
--- a/web/pdf_find_controller.js
+++ b/web/pdf_find_controller.js
@@ -551,9 +551,7 @@ class PDFFindController {
         return this._pdfDocument
           .getPage(i + 1)
           .then(pdfPage => {
-            return pdfPage.getTextContent({
-              normalizeWhitespace: true,
-            });
+            return pdfPage.getTextContent();
           })
           .then(
             textContent => {
diff --git a/web/pdf_page_view.js b/web/pdf_page_view.js
index 40e454fc8..f7e7dad3d 100644
--- a/web/pdf_page_view.js
+++ b/web/pdf_page_view.js
@@ -701,7 +701,6 @@ class PDFPageView {
         return finishPaintTask(null).then(() => {
           if (textLayer) {
             const readableStream = pdfPage.streamTextContent({
-              normalizeWhitespace: true,
               includeMarkedContent: true,
             });
             textLayer.setTextContentStream(readableStream);