Merge pull request #13393 from Snuffleupagus/adjustToUnicode-hasIncludedToUnicodeMap

Tweak `adjustToUnicode` to allow extending a built-in /ToUnicode map
2021-06-16 17:06:17 +02:00 · 2021-06-16 17:06:17 +02:00 · 7fa61c062c
commit 7fa61c062c
parent f9a0568f96 229a49b9b9
2 changed files with 71 additions and 55 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -3178,10 +3178,10 @@ class PartialEvaluator {
  }

  /**
-   * @returns {ToUnicodeMap}
+   * @returns {Array}
   * @private
   */
-  _buildSimpleFontToUnicode(properties, forceGlyphs = false) {
+  _simpleFontToUnicode(properties, forceGlyphs = false) {
    assert(!properties.composite, "Must be a simple font.");

    const toUnicode = [];
@ -3242,7 +3242,7 @@ class PartialEvaluator {
                Number.isNaN(code) &&
                Number.isInteger(parseInt(codeStr, 16))
              ) {
-                return this._buildSimpleFontToUnicode(
+                return this._simpleFontToUnicode(
                  properties,
                  /* forceGlyphs */ true
                );
@ -3275,7 +3275,7 @@ class PartialEvaluator {
      }
      toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
    }
-    return new ToUnicodeMap(toUnicode);
+    return toUnicode;
  }

  /**
@ -3284,7 +3284,7 @@ class PartialEvaluator {
   * @returns {Promise} A Promise that is resolved with a
   *   {ToUnicodeMap|IdentityToUnicodeMap} object.
   */
-  buildToUnicode(properties) {
+  async buildToUnicode(properties) {
    properties.hasIncludedToUnicodeMap =
      !!properties.toUnicode && properties.toUnicode.length > 0;

@ -3294,11 +3294,9 @@ class PartialEvaluator {
      // text-extraction. For simple fonts, containing encoding information,
      // use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
      if (!properties.composite && properties.hasEncoding) {
-        properties.fallbackToUnicode =
-          this._buildSimpleFontToUnicode(properties);
+        properties.fallbackToUnicode = this._simpleFontToUnicode(properties);
      }
-
-      return Promise.resolve(properties.toUnicode);
+      return properties.toUnicode;
    }

    // According to the spec if the font is a simple font we should only map
@ -3307,7 +3305,7 @@ class PartialEvaluator {
    // in pratice it seems better to always try to create a toUnicode map
    // based of the default encoding.
    if (!properties.composite /* is simple font */) {
-      return Promise.resolve(this._buildSimpleFontToUnicode(properties));
+      return new ToUnicodeMap(this._simpleFontToUnicode(properties));
    }

    // If the font is a composite font that uses one of the predefined CMaps
@ -3330,42 +3328,37 @@ class PartialEvaluator {
      // b) Obtain the registry and ordering of the character collection used
      // by the font’s CMap (for example, Adobe and Japan1) from its
      // CIDSystemInfo dictionary.
-      const registry = properties.cidSystemInfo.registry;
-      const ordering = properties.cidSystemInfo.ordering;
+      const { registry, ordering } = properties.cidSystemInfo;
      // c) Construct a second CMap name by concatenating the registry and
      // ordering obtained in step (b) in the format registry–ordering–UCS2
      // (for example, Adobe–Japan1–UCS2).
-      const ucs2CMapName = Name.get(registry + "-" + ordering + "-UCS2");
+      const ucs2CMapName = Name.get(`${registry}-${ordering}-UCS2`);
      // d) Obtain the CMap with the name constructed in step (c) (available
      // from the ASN Web site; see the Bibliography).
-      return CMapFactory.create({
+      const ucs2CMap = await CMapFactory.create({
        encoding: ucs2CMapName,
        fetchBuiltInCMap: this._fetchBuiltInCMapBound,
        useCMap: null,
-      }).then(function (ucs2CMap) {
-        const cMap = properties.cMap;
-        const toUnicode = [];
-        cMap.forEach(function (charcode, cid) {
-          if (cid > 0xffff) {
-            throw new FormatError("Max size of CID is 65,535");
-          }
-          // e) Map the CID obtained in step (a) according to the CMap
-          // obtained in step (d), producing a Unicode value.
-          const ucs2 = ucs2CMap.lookup(cid);
-          if (ucs2) {
-            toUnicode[charcode] = String.fromCharCode(
-              (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1)
-            );
-          }
-        });
-        return new ToUnicodeMap(toUnicode);
      });
+      const toUnicode = [];
+      properties.cMap.forEach(function (charcode, cid) {
+        if (cid > 0xffff) {
+          throw new FormatError("Max size of CID is 65,535");
+        }
+        // e) Map the CID obtained in step (a) according to the CMap
+        // obtained in step (d), producing a Unicode value.
+        const ucs2 = ucs2CMap.lookup(cid);
+        if (ucs2) {
+          toUnicode[charcode] = String.fromCharCode(
+            (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1)
+          );
+        }
+      });
+      return new ToUnicodeMap(toUnicode);
    }

    // The viewer's choice, just use an identity map.
-    return Promise.resolve(
-      new IdentityToUnicodeMap(properties.firstChar, properties.lastChar)
-    );
+    return new IdentityToUnicodeMap(properties.firstChar, properties.lastChar);
  }

  readToUnicode(cmapObj) {
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@ -135,9 +135,6 @@ function adjustToUnicode(properties, builtInEncoding) {
  if (properties.isInternalFont) {
    return;
  }
-  if (properties.hasIncludedToUnicodeMap) {
-    return; // The font dictionary has a `ToUnicode` entry.
-  }
  if (builtInEncoding === properties.defaultEncoding) {
    return; // No point in trying to adjust `toUnicode` if the encodings match.
  }
@ -147,11 +144,17 @@ function adjustToUnicode(properties, builtInEncoding) {
  const toUnicode = [],
    glyphsUnicodeMap = getGlyphsUnicode();
  for (const charCode in builtInEncoding) {
-    if (
-      properties.hasEncoding &&
-      properties.differences[charCode] !== undefined
-    ) {
-      continue; // The font dictionary has an `Encoding`/`Differences` entry.
+    if (properties.hasIncludedToUnicodeMap) {
+      if (properties.toUnicode.has(charCode)) {
+        continue; // The font dictionary has a `ToUnicode` entry.
+      }
+    } else {
+      if (
+        properties.hasEncoding &&
+        properties.differences[charCode] !== undefined
+      ) {
+        continue; // The font dictionary has an `Encoding`/`Differences` entry.
+      }
    }
    const glyphName = builtInEncoding[charCode];
    const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
@ -159,7 +162,32 @@ function adjustToUnicode(properties, builtInEncoding) {
      toUnicode[charCode] = String.fromCharCode(unicode);
    }
  }
-  properties.toUnicode.amend(toUnicode);
+  if (toUnicode.length > 0) {
+    properties.toUnicode.amend(toUnicode);
+  }
+}
+
+/**
+ * NOTE: This function should only be called at the *end* of font-parsing,
+ *       after e.g. `adjustToUnicode` has run, to prevent any issues.
+ */
+function amendFallbackToUnicode(properties) {
+  if (!properties.fallbackToUnicode) {
+    return;
+  }
+  if (properties.toUnicode instanceof IdentityToUnicodeMap) {
+    return;
+  }
+  const toUnicode = [];
+  for (const charCode in properties.fallbackToUnicode) {
+    if (properties.toUnicode.has(charCode)) {
+      continue; // The font dictionary has a `ToUnicode` entry.
+    }
+    toUnicode[charCode] = properties.fallbackToUnicode[charCode];
+  }
+  if (toUnicode.length > 0) {
+    properties.toUnicode.amend(toUnicode);
+  }
 }

 class Glyph {
@ -849,8 +877,6 @@ class Font {
    this.defaultEncoding = properties.defaultEncoding;

    this.toUnicode = properties.toUnicode;
-    this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap();
-
    this.toFontChar = [];

    if (properties.type === "Type3") {
@ -936,6 +962,7 @@ class Font {
      return;
    }

+    amendFallbackToUnicode(properties);
    this.data = data;
    this.fontType = getFontType(type, subtype, properties.isStandardFont);

@ -1094,6 +1121,8 @@ class Font {
      }
      this.toFontChar = map;
    }
+
+    amendFallbackToUnicode(properties);
    this.loadedName = fontName.split("-")[0];
    this.fontType = getFontType(type, subtype, properties.isStandardFont);
  }
@ -2545,12 +2574,9 @@ class Font {
        const glyphsUnicodeMap = getGlyphsUnicode();
        for (let charCode = 0; charCode < 256; charCode++) {
          let glyphName;
-          if (this.differences && charCode in this.differences) {
+          if (this.differences[charCode] !== undefined) {
            glyphName = this.differences[charCode];
-          } else if (
-            charCode in baseEncoding &&
-            baseEncoding[charCode] !== ""
-          ) {
+          } else if (baseEncoding[charCode] !== "") {
            glyphName = baseEncoding[charCode];
          } else {
            glyphName = StandardEncoding[charCode];
@ -2955,15 +2981,12 @@ class Font {
    width = isNum(width) ? width : this.defaultWidth;
    const vmetric = this.vmetrics && this.vmetrics[widthCode];

-    let unicode =
-      this.toUnicode.get(charcode) ||
-      this.fallbackToUnicode.get(charcode) ||
-      charcode;
+    let unicode = this.toUnicode.get(charcode) || charcode;
    if (typeof unicode === "number") {
      unicode = String.fromCharCode(unicode);
    }

-    let isInFont = charcode in this.toFontChar;
+    let isInFont = this.toFontChar[charcode] !== undefined;
    // First try the toFontChar map, if it's not there then try falling
    // back to the char code.
    fontCharCode = this.toFontChar[charcode] || charcode;