From 6eef69de2261aa6e34987ec6b6707985de3cf960 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Fri, 7 May 2021 22:25:08 +0200
Subject: [PATCH] Export the "raw" `toUnicode`-data from
 `PartialEvaluator.preEvaluateFont`

Compared to other data-structures, such as e.g. `Dict`s, we're purposely *not* caching Streams on the `XRef`-instance.[1]
The, somewhat unfortunate, effect of Streams not being cached is that repeatedly getting the *same* Stream-data requires re-parsing/re-initializing of a bunch of data; see `XRef.fetch` and related methods.

For the font-parsing in particular we're currently fetching the `toUnicode`-data, which is very often a Stream, in `PartialEvaluator.preEvaluateFont` and then *again* in `PartialEvaluator.extractDataStructures` soon afterwards.
By instead letting `PartialEvaluator.preEvaluateFont` export the "raw" `toUnicode`-data, we can avoid *some* unnecessary re-parsing/re-initializing when handling fonts.
*Please note:* In this particular case, given that `PartialEvaluator.preEvaluateFont` only accesses the "raw" `toUnicode` data, exporting a Stream should be safe.

---
[1] The reasons for this include:
 - Streams, especially `DecodeStream`-instances, can become *very* large once read. Hence caching them really isn't a good idea simply because of the (potential) memory impact of doing so.

 - Attempting to read from the *same* Stream-instance more than once won't work, unless it's `reset` in between, since using any method such as e.g. `getBytes` always starts at the current data position.

 - Given that parsing, even in the worker-thread, is now fairly asynchronous it's generally impossible to assert that any one Stream-instance isn't being accessed "concurrently" by e.g. different `getOperatorList` calls. Hence `reset`-ing a cached Stream-instance isn't going to work in the general case.
---
 src/core/evaluator.js | 44 ++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index d5476347d..f38678ccf 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -2978,10 +2978,9 @@ class PartialEvaluator {
     const xref = this.xref;
     let cidToGidBytes;
     // 9.10.2
-    const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
-    const toUnicodePromise = toUnicode
-      ? this.readToUnicode(toUnicode)
-      : Promise.resolve(undefined);
+    const toUnicodePromise = this.readToUnicode(
+      properties.toUnicode || dict.get("ToUnicode") || baseDict.get("ToUnicode")
+    );
 
     if (properties.composite) {
       // CIDSystemInfo helps to match CID to glyphs
@@ -3289,8 +3288,10 @@ class PartialEvaluator {
     );
   }
 
-  readToUnicode(toUnicode) {
-    const cmapObj = toUnicode;
+  readToUnicode(cmapObj) {
+    if (!cmapObj) {
+      return Promise.resolve(null);
+    }
     if (isName(cmapObj)) {
       return CMapFactory.create({
         encoding: cmapObj,
@@ -3541,7 +3542,7 @@ class PartialEvaluator {
     }
 
     let composite = false;
-    let uint8array;
+    let hash, toUnicode;
     if (type.name === "Type0") {
       // If font is a composite
       //  - get the descendant font
@@ -3566,7 +3567,6 @@ class PartialEvaluator {
     const firstChar = dict.get("FirstChar") || 0,
       lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff);
     const descriptor = dict.get("FontDescriptor");
-    let hash;
     if (descriptor) {
       hash = new MurmurHash3_64();
 
@@ -3601,10 +3601,10 @@ class PartialEvaluator {
 
       hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf
 
-      const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
+      toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
       if (isStream(toUnicode)) {
         const stream = toUnicode.str || toUnicode;
-        uint8array = stream.buffer
+        const uint8array = stream.buffer
           ? new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength)
           : new Uint8Array(
               stream.bytes.buffer,
@@ -3659,18 +3659,22 @@ class PartialEvaluator {
       type: type.name,
       firstChar,
       lastChar,
+      toUnicode,
       hash: hash ? hash.hexdigest() : "",
     };
   }
 
-  async translateFont(preEvaluatedFont) {
-    const baseDict = preEvaluatedFont.baseDict;
-    const dict = preEvaluatedFont.dict;
-    const composite = preEvaluatedFont.composite;
-    let descriptor = preEvaluatedFont.descriptor;
-    const type = preEvaluatedFont.type;
-    const firstChar = preEvaluatedFont.firstChar,
-      lastChar = preEvaluatedFont.lastChar;
+  async translateFont({
+    descriptor,
+    dict,
+    baseDict,
+    composite,
+    type,
+    firstChar,
+    lastChar,
+    toUnicode,
+    cssFontInfo,
+  }) {
     let properties;
 
     if (!descriptor) {
@@ -3710,6 +3714,7 @@ class PartialEvaluator {
           flags,
           firstChar,
           lastChar,
+          toUnicode,
         };
         const widths = dict.get("Widths");
         return this.extractDataStructures(dict, dict, properties).then(
@@ -3806,6 +3811,7 @@ class PartialEvaluator {
       fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX,
       firstChar,
       lastChar,
+      toUnicode,
       bbox: descriptor.getArray("FontBBox"),
       ascent: descriptor.get("Ascent"),
       descent: descriptor.get("Descent"),
@@ -3814,7 +3820,7 @@ class PartialEvaluator {
       flags: descriptor.get("Flags"),
       italicAngle: descriptor.get("ItalicAngle"),
       isType3Font: false,
-      cssFontInfo: preEvaluatedFont.cssFontInfo,
+      cssFontInfo,
     };
 
     if (composite) {