Refactor the building of ToUnicode maps for simple fonts into a helper method

2017-11-26 12:53:06 +01:00 · 2017-11-26 12:53:06 +01:00 · ffbfc3c2a7
commit ffbfc3c2a7
parent ada47fe373
1 changed files with 91 additions and 81 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -1932,30 +1932,17 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
    },
    /**
-     * Builds a char code to unicode map based on section 9.10 of the spec.
+     * @returns {ToUnicodeMap}
-     * @param {Object} properties Font properties object.
+     * @private
     * @return {Promise} A Promise that is resolved with a
     *   {ToUnicodeMap|IdentityToUnicodeMap} object.
     */
-    buildToUnicode: function PartialEvaluator_buildToUnicode(properties) {
+    _buildSimpleFontToUnicode(properties) {
-      properties.hasIncludedToUnicodeMap =
+      assert(!properties.composite, 'Must be a simple font.');
-        !!properties.toUnicode && properties.toUnicode.length > 0;
+
-      // Section 9.10.2 Mapping Character Codes to Unicode Values
+      let toUnicode = [], charcode, glyphName;
-      if (properties.hasIncludedToUnicodeMap) {
+      let encoding = properties.defaultEncoding.slice();
-        return Promise.resolve(properties.toUnicode);
+      let baseEncodingName = properties.baseEncodingName;
      }
      // According to the spec if the font is a simple font we should only map
      // to unicode if the base encoding is MacRoman, MacExpert, or WinAnsi or
      // the differences array only contains adobe standard or symbol set names,
      // in practice it seems better to always try to create a toUnicode
      // map based on the default encoding.
      var toUnicode, charcode, glyphName;
      if (!properties.composite /* is simple font */) {
        toUnicode = [];
        var encoding = properties.defaultEncoding.slice();
        var baseEncodingName = properties.baseEncodingName;
      // Merge in the differences array.
-        var differences = properties.differences;
+      let differences = properties.differences;
      for (charcode in differences) {
        glyphName = differences[charcode];
        if (glyphName === '.notdef') {
@ -1965,7 +1952,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        }
        encoding[charcode] = glyphName;
      }
-        var glyphsUnicodeMap = getGlyphsUnicode();
+      let glyphsUnicodeMap = getGlyphsUnicode();
      for (charcode in encoding) {
        // a) Map the character code to a character name.
        glyphName = encoding[charcode];
@ -1976,7 +1963,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        } else if (glyphsUnicodeMap[glyphName] === undefined) {
          // (undocumented) c) Few heuristics to recognize unknown glyphs
          // NOTE: Adobe Reader does not do this step, but OSX Preview does
-            var code = 0;
+          let code = 0;
          switch (glyphName[0]) {
            case 'G': // Gxx glyph
              if (glyphName.length === 3) {
@ -1996,18 +1983,17 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
              break;
            default:
              // 'uniXXXX'/'uXXXX{XX}' glyphs
-                var unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
+              let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
              if (unicode !== -1) {
                code = unicode;
              }
          }
          if (code) {
-              // If |baseEncodingName| is one the predefined encodings,
+            // If `baseEncodingName` is one of the predefined encodings, and `code`
-              // and |code| equals |charcode|, using the glyph defined in the
+            // equals `charcode`, using the glyph defined in the baseEncoding
-              // baseEncoding seems to yield a better |toUnicode| mapping
+            // seems to yield a better `toUnicode` mapping (fixes issue 5070).
              // (fixes issue 5070).
            if (baseEncodingName && code === +charcode) {
-                var baseEncoding = getEncoding(baseEncodingName);
+              let baseEncoding = getEncoding(baseEncodingName);
              if (baseEncoding && (glyphName = baseEncoding[charcode])) {
                toUnicode[charcode] =
                  String.fromCharCode(glyphsUnicodeMap[glyphName]);
@ -2018,11 +2004,35 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
          }
          continue;
        }
-          toUnicode[charcode] =
+        toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
            String.fromCharCode(glyphsUnicodeMap[glyphName]);
      }
-        return Promise.resolve(new ToUnicodeMap(toUnicode));
+      return new ToUnicodeMap(toUnicode);
    },
    /**
     * Builds a char code to unicode map based on section 9.10 of the spec.
     * @param {Object} properties Font properties object.
     * @return {Promise} A Promise that is resolved with a
     *   {ToUnicodeMap|IdentityToUnicodeMap} object.
     */
    buildToUnicode(properties) {
      properties.hasIncludedToUnicodeMap =
        !!properties.toUnicode && properties.toUnicode.length > 0;
      // Section 9.10.2 Mapping Character Codes to Unicode Values
      if (properties.hasIncludedToUnicodeMap) {
        return Promise.resolve(properties.toUnicode);
      }
      // According to the spec if the font is a simple font we should only map
      // to unicode if the base encoding is MacRoman, MacExpert, or WinAnsi or
      // the differences array only contains adobe standard or symbol set names,
      // in practice it seems better to always try to create a toUnicode map
      // based on the default encoding.
      if (!properties.composite /* is simple font */) {
        return Promise.resolve(this._buildSimpleFontToUnicode(properties));
      }
      // If the font is a composite font that uses one of the predefined CMaps
      // listed in Table 118 (except Identity–H and Identity–V) or whose
      // descendant CIDFont uses the Adobe-GB1, Adobe-CNS1, Adobe-Japan1, or
@ -2041,12 +2051,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        // b) Obtain the registry and ordering of the character collection used
        // by the font’s CMap (for example, Adobe and Japan1) from its
        // CIDSystemInfo dictionary.
-        var registry = properties.cidSystemInfo.registry;
+        let registry = properties.cidSystemInfo.registry;
-        var ordering = properties.cidSystemInfo.ordering;
+        let ordering = properties.cidSystemInfo.ordering;
        // c) Construct a second CMap name by concatenating the registry and
        // ordering obtained in step (b) in the format registry–ordering–UCS2
        // (for example, Adobe–Japan1–UCS2).
-        var ucs2CMapName = Name.get(registry + '-' + ordering + '-UCS2');
+        let ucs2CMapName = Name.get(registry + '-' + ordering + '-UCS2');
        // d) Obtain the CMap with the name constructed in step (c) (available
        // from the ASN Web site; see the Bibliography).
        return CMapFactory.create({
@ -2054,15 +2064,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
          fetchBuiltInCMap: this.fetchBuiltInCMap,
          useCMap: null,
        }).then(function (ucs2CMap) {
-          var cMap = properties.cMap;
+          let cMap = properties.cMap;
-          toUnicode = [];
+          let toUnicode = [];
          cMap.forEach(function(charcode, cid) {
            if (cid > 0xffff) {
              throw new FormatError('Max size of CID is 65,535');
            }
            // e) Map the CID obtained in step (a) according to the CMap
            // obtained in step (d), producing a Unicode value.
-            var ucs2 = ucs2CMap.lookup(cid);
+            let ucs2 = ucs2CMap.lookup(cid);
            if (ucs2) {
              toUnicode[charcode] =
                String.fromCharCode((ucs2.charCodeAt(0) << 8) +