From 3660aaac85e6be571c238e969d9cf5ca598e4215 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Mon, 17 May 2021 14:34:08 +0200
Subject: [PATCH 1/4] Tweak `adjustToUnicode` to allow extending a built-in
 /ToUnicode map

*This is somewhat similar to the recent changes, in PR 13277, for fonts with an /Encoding entry.*

Currently we're *completely* ignoring the `builtInEncoding`, from the font data itself, for fonts which have a built-in /ToUnicode map.
While it (obviously) doesn't seem like a good idea in general to simply overwrite existing built-in /ToUnicode entries, it should however not hurt to use the `builtInEncoding` to supplement *missing* /ToUnicode entries.
---
 src/core/fonts.js | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/core/fonts.js b/src/core/fonts.js
index 8b8ec434d..b391ab693 100644
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@@ -135,9 +135,6 @@ function adjustToUnicode(properties, builtInEncoding) {
   if (properties.isInternalFont) {
     return;
   }
-  if (properties.hasIncludedToUnicodeMap) {
-    return; // The font dictionary has a `ToUnicode` entry.
-  }
   if (builtInEncoding === properties.defaultEncoding) {
     return; // No point in trying to adjust `toUnicode` if the encodings match.
   }
@@ -147,11 +144,17 @@ function adjustToUnicode(properties, builtInEncoding) {
   const toUnicode = [],
     glyphsUnicodeMap = getGlyphsUnicode();
   for (const charCode in builtInEncoding) {
-    if (
-      properties.hasEncoding &&
-      properties.differences[charCode] !== undefined
-    ) {
-      continue; // The font dictionary has an `Encoding`/`Differences` entry.
+    if (properties.hasIncludedToUnicodeMap) {
+      if (properties.toUnicode.has(charCode)) {
+        continue; // The font dictionary has a `ToUnicode` entry.
+      }
+    } else {
+      if (
+        properties.hasEncoding &&
+        properties.differences[charCode] !== undefined
+      ) {
+        continue; // The font dictionary has an `Encoding`/`Differences` entry.
+      }
     }
     const glyphName = builtInEncoding[charCode];
     const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
@@ -159,7 +162,9 @@ function adjustToUnicode(properties, builtInEncoding) {
       toUnicode[charCode] = String.fromCharCode(unicode);
     }
   }
-  properties.toUnicode.amend(toUnicode);
+  if (toUnicode.length > 0) {
+    properties.toUnicode.amend(toUnicode);
+  }
 }
 
 class Glyph {

From edc38de37ac7a1ba659b59598a254c38ac1fcf33 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Mon, 17 May 2021 15:40:23 +0200
Subject: [PATCH 2/4] Convert `PartialEvaluator.buildToUnicode` to an `async`
 method

This removes the need to *manually* wrap all return values in a Promise.
---
 src/core/evaluator.js | 50 +++++++++++++++++++------------------------
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 642e6aa47..4ef6f6e57 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -3284,7 +3284,7 @@ class PartialEvaluator {
    * @returns {Promise} A Promise that is resolved with a
    *   {ToUnicodeMap|IdentityToUnicodeMap} object.
    */
-  buildToUnicode(properties) {
+  async buildToUnicode(properties) {
     properties.hasIncludedToUnicodeMap =
       !!properties.toUnicode && properties.toUnicode.length > 0;
 
@@ -3297,8 +3297,7 @@ class PartialEvaluator {
         properties.fallbackToUnicode =
           this._buildSimpleFontToUnicode(properties);
       }
-
-      return Promise.resolve(properties.toUnicode);
+      return properties.toUnicode;
     }
 
     // According to the spec if the font is a simple font we should only map
@@ -3307,7 +3306,7 @@ class PartialEvaluator {
     // in pratice it seems better to always try to create a toUnicode map
     // based of the default encoding.
     if (!properties.composite /* is simple font */) {
-      return Promise.resolve(this._buildSimpleFontToUnicode(properties));
+      return this._buildSimpleFontToUnicode(properties);
     }
 
     // If the font is a composite font that uses one of the predefined CMaps
@@ -3330,42 +3329,37 @@ class PartialEvaluator {
       // b) Obtain the registry and ordering of the character collection used
       // by the font’s CMap (for example, Adobe and Japan1) from its
       // CIDSystemInfo dictionary.
-      const registry = properties.cidSystemInfo.registry;
-      const ordering = properties.cidSystemInfo.ordering;
+      const { registry, ordering } = properties.cidSystemInfo;
       // c) Construct a second CMap name by concatenating the registry and
       // ordering obtained in step (b) in the format registry–ordering–UCS2
       // (for example, Adobe–Japan1–UCS2).
-      const ucs2CMapName = Name.get(registry + "-" + ordering + "-UCS2");
+      const ucs2CMapName = Name.get(`${registry}-${ordering}-UCS2`);
       // d) Obtain the CMap with the name constructed in step (c) (available
       // from the ASN Web site; see the Bibliography).
-      return CMapFactory.create({
+      const ucs2CMap = await CMapFactory.create({
         encoding: ucs2CMapName,
         fetchBuiltInCMap: this._fetchBuiltInCMapBound,
         useCMap: null,
-      }).then(function (ucs2CMap) {
-        const cMap = properties.cMap;
-        const toUnicode = [];
-        cMap.forEach(function (charcode, cid) {
-          if (cid > 0xffff) {
-            throw new FormatError("Max size of CID is 65,535");
-          }
-          // e) Map the CID obtained in step (a) according to the CMap
-          // obtained in step (d), producing a Unicode value.
-          const ucs2 = ucs2CMap.lookup(cid);
-          if (ucs2) {
-            toUnicode[charcode] = String.fromCharCode(
-              (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1)
-            );
-          }
-        });
-        return new ToUnicodeMap(toUnicode);
       });
+      const toUnicode = [];
+      properties.cMap.forEach(function (charcode, cid) {
+        if (cid > 0xffff) {
+          throw new FormatError("Max size of CID is 65,535");
+        }
+        // e) Map the CID obtained in step (a) according to the CMap
+        // obtained in step (d), producing a Unicode value.
+        const ucs2 = ucs2CMap.lookup(cid);
+        if (ucs2) {
+          toUnicode[charcode] = String.fromCharCode(
+            (ucs2.charCodeAt(0) << 8) + ucs2.charCodeAt(1)
+          );
+        }
+      });
+      return new ToUnicodeMap(toUnicode);
     }
 
     // The viewer's choice, just use an identity map.
-    return Promise.resolve(
-      new IdentityToUnicodeMap(properties.firstChar, properties.lastChar)
-    );
+    return new IdentityToUnicodeMap(properties.firstChar, properties.lastChar);
   }
 
   readToUnicode(cmapObj) {

From 7190bc23a856b12b06e91d78eb4564e9a0a9a775 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Tue, 18 May 2021 09:04:14 +0200
Subject: [PATCH 3/4] Remove unnecessary `in` checks of Arrays, when building
 the `charCodeToGlyphId` for TrueType fonts

Note that all standard Encodings have the same length (i.e. `256` elements) and that missing entries are always represented by empty strings, hence why a separate exists-check isn't necessary in the `baseEncoding` case.
---
 src/core/fonts.js | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/src/core/fonts.js b/src/core/fonts.js
index b391ab693..6fb746197 100644
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@@ -2550,12 +2550,9 @@ class Font {
         const glyphsUnicodeMap = getGlyphsUnicode();
         for (let charCode = 0; charCode < 256; charCode++) {
           let glyphName;
-          if (this.differences && charCode in this.differences) {
+          if (this.differences[charCode] !== undefined) {
             glyphName = this.differences[charCode];
-          } else if (
-            charCode in baseEncoding &&
-            baseEncoding[charCode] !== ""
-          ) {
+          } else if (baseEncoding[charCode] !== "") {
             glyphName = baseEncoding[charCode];
           } else {
             glyphName = StandardEncoding[charCode];

From 229a49b9b902768ed19cdafad18ad609b341d15f Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Tue, 18 May 2021 13:45:19 +0200
Subject: [PATCH 4/4] Re-factor the `fallbackToUnicode` functionality (PR 9192
 follow-up)

Rather than having to create and check a *separate* `ToUnicodeMap` to handle these cases, we can simply use the `fallbackToUnicode`-data (when it exists) to directly supplement *missing* /ToUnicode entries in the regular `ToUnicodeMap` instead.
---
 src/core/evaluator.js | 13 ++++++-------
 src/core/fonts.js     | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/core/evaluator.js b/src/core/evaluator.js
index 4ef6f6e57..2eb9847c9 100644
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@@ -3178,10 +3178,10 @@ class PartialEvaluator {
   }
 
   /**
-   * @returns {ToUnicodeMap}
+   * @returns {Array}
    * @private
    */
-  _buildSimpleFontToUnicode(properties, forceGlyphs = false) {
+  _simpleFontToUnicode(properties, forceGlyphs = false) {
     assert(!properties.composite, "Must be a simple font.");
 
     const toUnicode = [];
@@ -3242,7 +3242,7 @@ class PartialEvaluator {
                 Number.isNaN(code) &&
                 Number.isInteger(parseInt(codeStr, 16))
               ) {
-                return this._buildSimpleFontToUnicode(
+                return this._simpleFontToUnicode(
                   properties,
                   /* forceGlyphs */ true
                 );
@@ -3275,7 +3275,7 @@ class PartialEvaluator {
       }
       toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
     }
-    return new ToUnicodeMap(toUnicode);
+    return toUnicode;
   }
 
   /**
@@ -3294,8 +3294,7 @@ class PartialEvaluator {
       // text-extraction. For simple fonts, containing encoding information,
       // use a fallback ToUnicode map to improve this (fixes issue8229.pdf).
       if (!properties.composite && properties.hasEncoding) {
-        properties.fallbackToUnicode =
-          this._buildSimpleFontToUnicode(properties);
+        properties.fallbackToUnicode = this._simpleFontToUnicode(properties);
       }
       return properties.toUnicode;
     }
@@ -3306,7 +3305,7 @@ class PartialEvaluator {
     // in pratice it seems better to always try to create a toUnicode map
     // based of the default encoding.
     if (!properties.composite /* is simple font */) {
-      return this._buildSimpleFontToUnicode(properties);
+      return new ToUnicodeMap(this._simpleFontToUnicode(properties));
     }
 
     // If the font is a composite font that uses one of the predefined CMaps
diff --git a/src/core/fonts.js b/src/core/fonts.js
index 6fb746197..370035ffe 100644
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@@ -167,6 +167,29 @@ function adjustToUnicode(properties, builtInEncoding) {
   }
 }
 
+/**
+ * NOTE: This function should only be called at the *end* of font-parsing,
+ *       after e.g. `adjustToUnicode` has run, to prevent any issues.
+ */
+function amendFallbackToUnicode(properties) {
+  if (!properties.fallbackToUnicode) {
+    return;
+  }
+  if (properties.toUnicode instanceof IdentityToUnicodeMap) {
+    return;
+  }
+  const toUnicode = [];
+  for (const charCode in properties.fallbackToUnicode) {
+    if (properties.toUnicode.has(charCode)) {
+      continue; // The font dictionary has a `ToUnicode` entry.
+    }
+    toUnicode[charCode] = properties.fallbackToUnicode[charCode];
+  }
+  if (toUnicode.length > 0) {
+    properties.toUnicode.amend(toUnicode);
+  }
+}
+
 class Glyph {
   constructor(
     originalCharCode,
@@ -854,8 +877,6 @@ class Font {
     this.defaultEncoding = properties.defaultEncoding;
 
     this.toUnicode = properties.toUnicode;
-    this.fallbackToUnicode = properties.fallbackToUnicode || new ToUnicodeMap();
-
     this.toFontChar = [];
 
     if (properties.type === "Type3") {
@@ -941,6 +962,7 @@ class Font {
       return;
     }
 
+    amendFallbackToUnicode(properties);
     this.data = data;
     this.fontType = getFontType(type, subtype, properties.isStandardFont);
 
@@ -1099,6 +1121,8 @@ class Font {
       }
       this.toFontChar = map;
     }
+
+    amendFallbackToUnicode(properties);
     this.loadedName = fontName.split("-")[0];
     this.fontType = getFontType(type, subtype, properties.isStandardFont);
   }
@@ -2957,15 +2981,12 @@ class Font {
     width = isNum(width) ? width : this.defaultWidth;
     const vmetric = this.vmetrics && this.vmetrics[widthCode];
 
-    let unicode =
-      this.toUnicode.get(charcode) ||
-      this.fallbackToUnicode.get(charcode) ||
-      charcode;
+    let unicode = this.toUnicode.get(charcode) || charcode;
     if (typeof unicode === "number") {
       unicode = String.fromCharCode(unicode);
     }
 
-    let isInFont = charcode in this.toFontChar;
+    let isInFont = this.toFontChar[charcode] !== undefined;
     // First try the toFontChar map, if it's not there then try falling
     // back to the char code.
     fontCharCode = this.toFontChar[charcode] || charcode;