Merge pull request #13393 from Snuffleupagus/adjustToUnicode-hasIncludedToUnicodeMap

Tweak `adjustToUnicode` to allow extending a built-in /ToUnicode map
2021-06-16 17:06:17 +02:00 · 2021-06-16 17:06:17 +02:00 · 7fa61c062c
commit 7fa61c062c
parent f9a0568f96 229a49b9b9
2 changed files with 71 additions and 55 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -3178,10 +3178,10 @@ class PartialEvaluator {
  }
  /**
-   * @returns {ToUnicodeMap}
+   * @returns {Array}
   * @private
   */
-  _buildSimpleFontToUnicode(properties, forceGlyphs = false) {
+  _simpleFontToUnicode(properties, forceGlyphs = false) {
    assert(!properties.composite, "Must be a simple font.");
    const toUnicode = [];
@ -3242,7 +3242,7 @@ class PartialEvaluator {
                Number.isNaN(code) &&
                Number.isInteger(parseInt(codeStr, 16))
              ) {
-                return this._buildSimpleFontToUnicode(
+                return this._simpleFontToUnicode(
                  properties,
                  /* forceGlyphs */ true
                );
@ -3275,7 +3275,7 @@ class PartialEvaluator {
      }
      toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
    }
-    return new ToUnicodeMap(toUnicode);
+    return toUnicode;
  }
  /**
@ -3284,7 +3284,7 @@ class PartialEvaluator {
   * @returns {Promise} A Promise that is resolved with a
   *   {ToUnicodeMap|IdentityToUnicodeMap} object.
   */
-  buildToUnicode(properties) {
+  async buildToUnicode(properties) {
    properties.hasIncludedToUnicodeMap =
      !!properties.toUnicode && properties.toUnicode.length > 0;
@ -3294,11 +3294,9 @@ class PartialEvaluator {
      // text-extraction. For simple fonts, containing encoding information,
      // use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
      if (!properties.composite && properties.hasEncoding) {
-        properties.fallbackToUnicode =
+        properties.fallbackToUnicode = this._simpleFontToUnicode(properties);
          this._buildSimpleFontToUnicode(properties);
      }
-
+      return properties.toUnicode;
      return Promise.resolve(properties.toUnicode);
    }
    // According to the spec if the font is a simple font we should only map
@ -3307,7 +3305,7 @@ class PartialEvaluator {
    // in practice it seems better to always try to create a toUnicode map
    // based on the default encoding.
    if (!properties.composite /* is simple font */) {
-      return Promise.resolve(this._buildSimpleFontToUnicode(properties));
+      return new ToUnicodeMap(this._simpleFontToUnicode(properties));
    }
    // If the font is a composite font that uses one of the predefined CMaps
@ -3330,22 +3328,20 @@ class PartialEvaluator {
      // b) Obtain the registry and ordering of the character collection used
      // by the font’s CMap (for example, Adobe and Japan1) from its
      // CIDSystemInfo dictionary.
-      const registry = properties.cidSystemInfo.registry;
+      const { registry, ordering } = properties.cidSystemInfo;
      const ordering = properties.cidSystemInfo.ordering;
      // c) Construct a second CMap name by concatenating the registry and
      // ordering obtained in step (b) in the format registry–ordering–UCS2
      // (for example, Adobe–Japan1–UCS2).
-      const ucs2CMapName = Name.get(registry + "-" + ordering + "-UCS2");
+      const ucs2CMapName = Name.get(`${registry}-${ordering}-UCS2`);
      // d) Obtain the CMap with the name constructed in step (c) (available
      // from the ASN Web site; see the Bibliography).
-      return CMapFactory.create({
+      const ucs2CMap = await CMapFactory.create({
        encoding: ucs2CMapName,
        fetchBuiltInCMap: this._fetchBuiltInCMapBound,
        useCMap: null,
-      }).then(function (ucs2CMap) {
+      });
        const cMap = properties.cMap;
      const toUnicode = [];
-        cMap.forEach(function (charcode, cid) {
+      properties.cMap.forEach(function (charcode, cid) {
        if (cid > 0xffff) {
          throw new FormatError("Max size of CID is 65,535");
        }
@ -3359,13 +3355,10 @@ class PartialEvaluator {
        }
      });
      return new ToUnicodeMap(toUnicode);
      });
    }
    // The viewer's choice, just use an identity map.
-    return Promise.resolve(
+    return new IdentityToUnicodeMap(properties.firstChar, properties.lastChar);
      new IdentityToUnicodeMap(properties.firstChar, properties.lastChar)
    );
  }
  readToUnicode(cmapObj) {
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@ -135,9 +135,6 @@ function adjustToUnicode(properties, builtInEncoding) {
  if (properties.isInternalFont) {
    return;
  }
  if (properties.hasIncludedToUnicodeMap) {
    return; // The font dictionary has a `ToUnicode` entry.
  }
  if (builtInEncoding === properties.defaultEncoding) {
    return; // No point in trying to adjust `toUnicode` if the encodings match.
  }
@ -147,20 +144,51 @@ function adjustToUnicode(properties, builtInEncoding) {
  const toUnicode = [],
    glyphsUnicodeMap = getGlyphsUnicode();
  for (const charCode in builtInEncoding) {
    if (properties.hasIncludedToUnicodeMap) {
      if (properties.toUnicode.has(charCode)) {
        continue; // The font dictionary has a `ToUnicode` entry.
      }
    } else {
      if (
        properties.hasEncoding &&
        properties.differences[charCode] !== undefined
      ) {
        continue; // The font dictionary has an `Encoding`/`Differences` entry.
      }
    }
    const glyphName = builtInEncoding[charCode];
    const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
    if (unicode !== -1) {
      toUnicode[charCode] = String.fromCharCode(unicode);
    }
  }
  if (toUnicode.length > 0) {
    properties.toUnicode.amend(toUnicode);
  }
 }
 /**
  * Copies entries from `properties.fallbackToUnicode` into
  * `properties.toUnicode` for every char code that the latter does not
  * already contain; existing `properties.toUnicode` entries are never
  * overwritten.
  *
  * Does nothing when there is no fallback data, or when
  * `properties.toUnicode` is an `IdentityToUnicodeMap`.
  *
  * NOTE: Only call this function at the *end* of font-parsing, i.e. after
  *       e.g. `adjustToUnicode` has run, to prevent any issues.
  *
  * @param {Object} properties - Font properties; its `toUnicode` map is
  *   updated in place through the map's `amend` method.
  */
 function amendFallbackToUnicode(properties) {
  const fallback = properties.fallbackToUnicode;
  if (!fallback || properties.toUnicode instanceof IdentityToUnicodeMap) {
    return;
  }
  const currentMap = properties.toUnicode;
  const additions = [];
  // `for...in` only visits the keys that are actually present in `fallback`.
  for (const code in fallback) {
    if (!currentMap.has(code)) {
      additions[code] = fallback[code];
    }
  }
  // Avoid calling `amend` when nothing was missing from the existing map.
  if (additions.length !== 0) {
    currentMap.amend(additions);
  }
 }
 class Glyph {
  constructor(
@ -849,8 +877,6 @@ class Font {
    this.defaultEncoding = properties.defaultEncoding;
    this.toUnicode = properties.toUnicode;
    this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap();
    this.toFontChar = [];
    if (properties.type === "Type3") {
@ -936,6 +962,7 @@ class Font {
      return;
    }
    amendFallbackToUnicode(properties);
    this.data = data;
    this.fontType = getFontType(type, subtype, properties.isStandardFont);
@ -1094,6 +1121,8 @@ class Font {
      }
      this.toFontChar = map;
    }
    amendFallbackToUnicode(properties);
    this.loadedName = fontName.split("-")[0];
    this.fontType = getFontType(type, subtype, properties.isStandardFont);
  }
@ -2545,12 +2574,9 @@ class Font {
        const glyphsUnicodeMap = getGlyphsUnicode();
        for (let charCode = 0; charCode < 256; charCode++) {
          let glyphName;
-          if (this.differences && charCode in this.differences) {
+          if (this.differences[charCode] !== undefined) {
            glyphName = this.differences[charCode];
-          } else if (
+          } else if (baseEncoding[charCode] !== "") {
            charCode in baseEncoding &&
            baseEncoding[charCode] !== ""
          ) {
            glyphName = baseEncoding[charCode];
          } else {
            glyphName = StandardEncoding[charCode];
@ -2955,15 +2981,12 @@ class Font {
    width = isNum(width) ? width : this.defaultWidth;
    const vmetric = this.vmetrics && this.vmetrics[widthCode];
-    let unicode =
+    let unicode = this.toUnicode.get(charcode) || charcode;
      this.toUnicode.get(charcode) ||
      this.fallbackToUnicode.get(charcode) ||
      charcode;
    if (typeof unicode === "number") {
      unicode = String.fromCharCode(unicode);
    }
-    let isInFont = charcode in this.toFontChar;
+    let isInFont = this.toFontChar[charcode] !== undefined;
    // First try the toFontChar map, if it's not there then try falling
    // back to the char code.
    fontCharCode = this.toFontChar[charcode] || charcode;