Re-factor the fallbackToUnicode functionality (PR 9192 follow-up)

Rather than having to create and check a *separate* `ToUnicodeMap` to handle these cases, we can simply use the `fallbackToUnicode`-data (when it exists) to directly supplement *missing* /ToUnicode entries in the regular `ToUnicodeMap` instead.
2021-05-18 13:45:19 +02:00 · 2021-05-18 13:45:19 +02:00 · 229a49b9b9
commit 229a49b9b9
parent 7190bc23a8
2 changed files with 34 additions and 14 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -3178,10 +3178,10 @@ class PartialEvaluator {
  }

  /**
-   * @returns {ToUnicodeMap}
+   * @returns {Array}
   * @private
   */
-  _buildSimpleFontToUnicode(properties, forceGlyphs = false) {
+  _simpleFontToUnicode(properties, forceGlyphs = false) {
    assert(!properties.composite, "Must be a simple font.");

    const toUnicode = [];
@ -3242,7 +3242,7 @@ class PartialEvaluator {
                Number.isNaN(code) &&
                Number.isInteger(parseInt(codeStr, 16))
              ) {
-                return this._buildSimpleFontToUnicode(
+                return this._simpleFontToUnicode(
                  properties,
                  /* forceGlyphs */ true
                );
@ -3275,7 +3275,7 @@ class PartialEvaluator {
      }
      toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
    }
-    return new ToUnicodeMap(toUnicode);
+    return toUnicode;
  }

  /**
@ -3294,8 +3294,7 @@ class PartialEvaluator {
      // text-extraction. For simple fonts, containing encoding information,
      // use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
      if (!properties.composite && properties.hasEncoding) {
-        properties.fallbackToUnicode =
-          this._buildSimpleFontToUnicode(properties);
+        properties.fallbackToUnicode = this._simpleFontToUnicode(properties);
      }
      return properties.toUnicode;
    }
@ -3306,7 +3305,7 @@ class PartialEvaluator {
    // in pratice it seems better to always try to create a toUnicode map
    // based of the default encoding.
    if (!properties.composite /* is simple font */) {
-      return this._buildSimpleFontToUnicode(properties);
+      return new ToUnicodeMap(this._simpleFontToUnicode(properties));
    }

    // If the font is a composite font that uses one of the predefined CMaps
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@ -167,6 +167,29 @@ function adjustToUnicode(properties, builtInEncoding) {
  }
 }

+/**
+ * NOTE: This function should only be called at the *end* of font-parsing,
+ *       after e.g. `adjustToUnicode` has run, to prevent any issues.
+ */
+function amendFallbackToUnicode(properties) {
+  if (!properties.fallbackToUnicode) {
+    return;
+  }
+  if (properties.toUnicode instanceof IdentityToUnicodeMap) {
+    return;
+  }
+  const toUnicode = [];
+  for (const charCode in properties.fallbackToUnicode) {
+    if (properties.toUnicode.has(charCode)) {
+      continue; // The font dictionary has a `ToUnicode` entry.
+    }
+    toUnicode[charCode] = properties.fallbackToUnicode[charCode];
+  }
+  if (toUnicode.length > 0) {
+    properties.toUnicode.amend(toUnicode);
+  }
+}
+
 class Glyph {
  constructor(
    originalCharCode,
@ -854,8 +877,6 @@ class Font {
    this.defaultEncoding = properties.defaultEncoding;

    this.toUnicode = properties.toUnicode;
-    this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap();
-
    this.toFontChar = [];

    if (properties.type === "Type3") {
@ -941,6 +962,7 @@ class Font {
      return;
    }

+    amendFallbackToUnicode(properties);
    this.data = data;
    this.fontType = getFontType(type, subtype, properties.isStandardFont);

@ -1099,6 +1121,8 @@ class Font {
      }
      this.toFontChar = map;
    }
+
+    amendFallbackToUnicode(properties);
    this.loadedName = fontName.split("-")[0];
    this.fontType = getFontType(type, subtype, properties.isStandardFont);
  }
@ -2957,15 +2981,12 @@ class Font {
    width = isNum(width) ? width : this.defaultWidth;
    const vmetric = this.vmetrics && this.vmetrics[widthCode];

-    let unicode =
-      this.toUnicode.get(charcode) ||
-      this.fallbackToUnicode.get(charcode) ||
-      charcode;
+    let unicode = this.toUnicode.get(charcode) || charcode;
    if (typeof unicode === "number") {
      unicode = String.fromCharCode(unicode);
    }

-    let isInFont = charcode in this.toFontChar;
+    let isInFont = this.toFontChar[charcode] !== undefined;
    // First try the toFontChar map, if it's not there then try falling
    // back to the char code.
    fontCharCode = this.toFontChar[charcode] || charcode;