Export the "raw" toUnicode-data from PartialEvaluator.preEvaluateFont

Compared to other data structures, such as `Dict`s, we're purposely *not* caching Streams on the `XRef`-instance.[1]

The somewhat unfortunate effect of Streams not being cached is that repeatedly getting the *same* Stream-data requires re-parsing/re-initializing a bunch of data; see `XRef.fetch` and related methods.

For the font-parsing in particular, we're currently fetching the `toUnicode`-data, which is very often a Stream, in `PartialEvaluator.preEvaluateFont` and then *again* in `PartialEvaluator.extractDataStructures` soon afterwards. By instead letting `PartialEvaluator.preEvaluateFont` export the "raw" `toUnicode`-data, we can avoid *some* unnecessary re-parsing/re-initializing when handling fonts.

*Please note:* In this particular case, given that `PartialEvaluator.preEvaluateFont` only accesses the "raw" `toUnicode`-data, exporting a Stream should be safe.

---

[1] The reasons for this include:

 - Streams, especially `DecodeStream`-instances, can become *very* large once read. Hence caching them really isn't a good idea, simply because of the (potential) memory impact of doing so.

 - Attempting to read from the *same* Stream-instance more than once won't work unless it's `reset` in between, since any method, such as `getBytes`, always starts reading at the current data position.

 - Given that parsing, even in the worker-thread, is now fairly asynchronous, it's generally impossible to assert that any one Stream-instance isn't being accessed "concurrently" by e.g. different `getOperatorList` calls. Hence `reset`-ing a cached Stream-instance isn't going to work in the general case.
2021-05-07 22:25:08 +02:00 · 2021-05-07 22:25:08 +02:00 · 6eef69de22
commit 6eef69de22
parent 13fb1654dc
1 changed files with 25 additions and 19 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -2978,10 +2978,9 @@ class PartialEvaluator {
    const xref = this.xref;
    let cidToGidBytes;
    // 9.10.2
-    const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
-    const toUnicodePromise = toUnicode
-      ? this.readToUnicode(toUnicode)
-      : Promise.resolve(undefined);
+    const toUnicodePromise = this.readToUnicode(
+      properties.toUnicode || dict.get("ToUnicode") || baseDict.get("ToUnicode")
+    );

    if (properties.composite) {
      // CIDSystemInfo helps to match CID to glyphs
@ -3289,8 +3288,10 @@ class PartialEvaluator {
    );
  }

-  readToUnicode(toUnicode) {
-    const cmapObj = toUnicode;
+  readToUnicode(cmapObj) {
+    if (!cmapObj) {
+      return Promise.resolve(null);
+    }
    if (isName(cmapObj)) {
      return CMapFactory.create({
        encoding: cmapObj,
@ -3541,7 +3542,7 @@ class PartialEvaluator {
    }

    let composite = false;
-    let uint8array;
+    let hash, toUnicode;
    if (type.name === "Type0") {
      // If font is a composite
      //  - get the descendant font
@ -3566,7 +3567,6 @@ class PartialEvaluator {
    const firstChar = dict.get("FirstChar") || 0,
      lastChar = dict.get("LastChar") || (composite ? 0xffff : 0xff);
    const descriptor = dict.get("FontDescriptor");
-    let hash;
    if (descriptor) {
      hash = new MurmurHash3_64();

@ -3601,10 +3601,10 @@ class PartialEvaluator {

      hash.update(`${firstChar}-${lastChar}`); // Fixes issue10665_reduced.pdf

-      const toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
+      toUnicode = dict.get("ToUnicode") || baseDict.get("ToUnicode");
      if (isStream(toUnicode)) {
        const stream = toUnicode.str || toUnicode;
-        uint8array = stream.buffer
+        const uint8array = stream.buffer
          ? new Uint8Array(stream.buffer.buffer, 0, stream.bufferLength)
          : new Uint8Array(
              stream.bytes.buffer,
@ -3659,18 +3659,22 @@ class PartialEvaluator {
      type: type.name,
      firstChar,
      lastChar,
+      toUnicode,
      hash: hash ? hash.hexdigest() : "",
    };
  }

-  async translateFont(preEvaluatedFont) {
-    const baseDict = preEvaluatedFont.baseDict;
-    const dict = preEvaluatedFont.dict;
-    const composite = preEvaluatedFont.composite;
-    let descriptor = preEvaluatedFont.descriptor;
-    const type = preEvaluatedFont.type;
-    const firstChar = preEvaluatedFont.firstChar,
-      lastChar = preEvaluatedFont.lastChar;
+  async translateFont({
+    descriptor,
+    dict,
+    baseDict,
+    composite,
+    type,
+    firstChar,
+    lastChar,
+    toUnicode,
+    cssFontInfo,
+  }) {
    let properties;

    if (!descriptor) {
@ -3710,6 +3714,7 @@ class PartialEvaluator {
          flags,
          firstChar,
          lastChar,
+          toUnicode,
        };
        const widths = dict.get("Widths");
        return this.extractDataStructures(dict, dict, properties).then(
@ -3806,6 +3811,7 @@ class PartialEvaluator {
      fontMatrix: dict.getArray("FontMatrix") || FONT_IDENTITY_MATRIX,
      firstChar,
      lastChar,
+      toUnicode,
      bbox: descriptor.getArray("FontBBox"),
      ascent: descriptor.get("Ascent"),
      descent: descriptor.get("Descent"),
@ -3814,7 +3820,7 @@ class PartialEvaluator {
      flags: descriptor.get("Flags"),
      italicAngle: descriptor.get("ItalicAngle"),
      isType3Font: false,
-      cssFontInfo: preEvaluatedFont.cssFontInfo,
+      cssFontInfo,
    };

    if (composite) {