From f90f9466e3fdd9741c607cc8ac2b9734bb520767 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald
Date: Fri, 3 Sep 2021 13:07:04 +0200
Subject: [PATCH] [api-minor] Reduce `postMessage` overhead, in
 `PartialEvaluator.getTextContent`, by sending text chunks in batches (issue
 13962)

Following the STR in the issue, this patch reduces the number of
`PartialEvaluator.getTextContent`-related `postMessage`-calls by
approximately 78 percent.[1] Note that by enforcing a relatively low
value when batching text chunks, we should improve worst-case scenarios
without negatively affecting `textLayer` building in general.

While working on these changes I noticed, thanks to our unit-tests,
that the implementation of the `appendEOL` function unfortunately means
that the number and content of the textItems could actually be affected
by the particular chunking used. That seems *extremely* unfortunate,
since in practice it means that the particular chunking used is
observable through the API. Obviously that should be a completely
internal implementation detail, which is why this patch also modifies
`appendEOL` to mitigate that.[2]

Given that this patch adds a *minimum* batch size in `enqueueChunk`,
there's obviously nothing preventing a batch from becoming a lot larger
than the limit (depending e.g. on the PDF structure and the CPU
load/speed). While sending more text chunks at once isn't an issue in
itself, it could become problematic on the main-thread during
`textLayer` building. Note how both the `PartialEvaluator` and
`CanvasGraphics` implementations utilize `Date.now()`-checks to prevent
long-running parsing/rendering from "hanging" the respective thread. In
the `textLayer` building we don't utilize such a construction[3], and
streaming of textContent is thus essentially acting as a *simple*
stand-in for that functionality. Hence we want to avoid choosing too
large a minimum batch size, since that could indirectly affect
main-thread performance negatively.

---
[1] While it'd be possible to go even lower, that'd likely require more
invasive re-factoring/changes to the
`PartialEvaluator.getTextContent`-code to ensure that the batches don't
become too large.

[2] This should also, as far as I can tell, explain some of the
regressions observed in the "enhance" text-selection tests back in
PR 13257. Looking closer at the `appendEOL` function, it could
potentially be changed even more; however, that should probably not be
done here.

[3] I'd really like to avoid implementing something like that for the
`textLayer` building as well, given that it'd require adding a fair bit
of complexity.
---
 src/core/evaluator.js | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 90615b228..3f378222d 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -107,6 +107,17 @@ const PatternType = {
   SHADING: 2,
 };
 
+// Optionally avoid sending individual, or very few, text chunks to reduce
+// `postMessage` overhead with ReadableStream (see issue 13962).
+//
+// PLEASE NOTE: This value should *not* be too large (it's used as a lower limit
+// in `enqueueChunk`), since that would cause streaming of textContent to become
+// essentially useless in practice by sending all (or most) chunks at once.
+// Also, too large a value would (indirectly) affect the main-thread `textLayer`
+// building negatively by forcing all textContent to be handled at once, which
+// could easily end up hurting *overall* performance (e.g. rendering as well).
+const TEXT_CHUNK_BATCH_SIZE = 10;
+
 const deferred = Promise.resolve();
 
 // Convert PDF blend mode names to HTML5 blend mode names.
@@ -2573,8 +2584,6 @@ class PartialEvaluator {
       if (textContentItem.initialized) {
         textContentItem.hasEOL = true;
         flushTextContentItem();
-      } else if (textContent.items.length > 0) {
-        textContent.items[textContent.items.length - 1].hasEOL = true;
       } else {
         textContent.items.push({
           str: "",
@@ -2656,20 +2665,24 @@ class PartialEvaluator {
       textContentItem.str.length = 0;
     }
 
-    function enqueueChunk() {
+    function enqueueChunk(batch = false) {
       const length = textContent.items.length;
-      if (length > 0) {
-        sink.enqueue(textContent, length);
-        textContent.items = [];
-        textContent.styles = Object.create(null);
+      if (length === 0) {
+        return;
       }
+      if (batch && length < TEXT_CHUNK_BATCH_SIZE) {
+        return;
+      }
+      sink.enqueue(textContent, length);
+      textContent.items = [];
+      textContent.styles = Object.create(null);
     }
 
     const timeSlotManager = new TimeSlotManager();
 
     return new Promise(function promiseBody(resolve, reject) {
      const next = function (promise) {
-        enqueueChunk();
+        enqueueChunk(/* batch = */ true);
         Promise.all([promise, sink.ready]).then(function () {
           try {
             promiseBody(resolve, reject);
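
Reviewer note (not part of the patch): the following is a minimal, standalone
sketch of the batching behaviour introduced above, assuming only a `sink`
object with an `enqueue` method. The names mirror the patch, but the
surrounding `getTextContent` machinery is deliberately reduced for clarity.

// Illustrative sketch only; simplified model of the new `enqueueChunk` logic.
const TEXT_CHUNK_BATCH_SIZE = 10;

function makeChunkQueue(sink) {
  let textContent = { items: [], styles: Object.create(null) };

  return {
    push(item) {
      textContent.items.push(item);
    },
    // With `batch = true` (used on every parsing iteration) nothing is sent
    // until at least TEXT_CHUNK_BATCH_SIZE items have accumulated; a final
    // call without `batch` flushes whatever remains.
    enqueueChunk(batch = false) {
      const length = textContent.items.length;
      if (length === 0) {
        return;
      }
      if (batch && length < TEXT_CHUNK_BATCH_SIZE) {
        return;
      }
      sink.enqueue(textContent, length);
      textContent = { items: [], styles: Object.create(null) };
    },
  };
}

// Usage: a fake sink that just records message sizes, showing the reduced
// number of `postMessage`-like calls.
const sent = [];
const queue = makeChunkQueue({ enqueue: (chunk, size) => sent.push(size) });
for (let i = 0; i < 25; i++) {
  queue.push({ str: "text item " + i });
  queue.enqueueChunk(/* batch = */ true);
}
queue.enqueueChunk(); // final flush
// `sent` is now [10, 10, 5] -- three messages instead of twenty-five.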