Refactor the building of ToUnicode maps for simple fonts a helper method

2017-11-26 12:53:06 +01:00 · 2017-11-26 12:53:06 +01:00 · ffbfc3c2a7
commit ffbfc3c2a7
parent ada47fe373
1 changed files with 91 additions and 81 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -1931,98 +1931,108 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
      });
    },
    /**
     * @returns {ToUnicodeMap}
     * @private
     */
    _buildSimpleFontToUnicode(properties) {
      assert(!properties.composite, 'Must be a simple font.');
      let toUnicode = [], charcode, glyphName;
      let encoding = properties.defaultEncoding.slice();
      let baseEncodingName = properties.baseEncodingName;
      // Merge in the differences array.
      let differences = properties.differences;
      for (charcode in differences) {
        glyphName = differences[charcode];
        if (glyphName === '.notdef') {
          // Skip .notdef to prevent rendering errors, e.g. boxes appearing
          // where there should be spaces (fixes issue5256.pdf).
          continue;
        }
        encoding[charcode] = glyphName;
      }
      let glyphsUnicodeMap = getGlyphsUnicode();
      for (charcode in encoding) {
        // a) Map the character code to a character name.
        glyphName = encoding[charcode];
        // b) Look up the character name in the Adobe Glyph List (see the
        //    Bibliography) to obtain the corresponding Unicode value.
        if (glyphName === '') {
          continue;
        } else if (glyphsUnicodeMap[glyphName] === undefined) {
          // (undocumented) c) Few heuristics to recognize unknown glyphs
          // NOTE: Adobe Reader does not do this step, but OSX Preview does
          let code = 0;
          switch (glyphName[0]) {
            case 'G': // Gxx glyph
              if (glyphName.length === 3) {
                code = parseInt(glyphName.substr(1), 16);
              }
              break;
            case 'g': // g00xx glyph
              if (glyphName.length === 5) {
                code = parseInt(glyphName.substr(1), 16);
              }
              break;
            case 'C': // Cddd glyph
            case 'c': // cddd glyph
              if (glyphName.length >= 3) {
                code = +glyphName.substr(1);
              }
              break;
            default:
              // 'uniXXXX'/'uXXXX{XX}' glyphs
              let unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
              if (unicode !== -1) {
                code = unicode;
              }
          }
          if (code) {
            // If `baseEncodingName` is one the predefined encodings, and `code`
            // equals `charcode`, using the glyph defined in the baseEncoding
            // seems to yield a better `toUnicode` mapping (fixes issue 5070).
            if (baseEncodingName && code === +charcode) {
              let baseEncoding = getEncoding(baseEncodingName);
              if (baseEncoding && (glyphName = baseEncoding[charcode])) {
                toUnicode[charcode] =
                  String.fromCharCode(glyphsUnicodeMap[glyphName]);
                continue;
              }
            }
            toUnicode[charcode] = String.fromCharCode(code);
          }
          continue;
        }
        toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
      }
      return new ToUnicodeMap(toUnicode);
    },
    /**
     * Builds a char code to unicode map based on section 9.10 of the spec.
     * @param {Object} properties Font properties object.
     * @return {Promise} A Promise that is resolved with a
     *   {ToUnicodeMap|IdentityToUnicodeMap} object.
     */
-    buildToUnicode: function PartialEvaluator_buildToUnicode(properties) {
+    buildToUnicode(properties) {
      properties.hasIncludedToUnicodeMap =
        !!properties.toUnicode && properties.toUnicode.length > 0;
      // Section 9.10.2 Mapping Character Codes to Unicode Values
      if (properties.hasIncludedToUnicodeMap) {
        return Promise.resolve(properties.toUnicode);
      }
      // According to the spec if the font is a simple font we should only map
      // to unicode if the base encoding is MacRoman, MacExpert, or WinAnsi or
      // the differences array only contains adobe standard or symbol set names,
-      // in pratice it seems better to always try to create a toUnicode
+      // in pratice it seems better to always try to create a toUnicode map
-      // map based of the default encoding.
+      // based of the default encoding.
      var toUnicode, charcode, glyphName;
      if (!properties.composite /* is simple font */) {
-        toUnicode = [];
+        return Promise.resolve(this._buildSimpleFontToUnicode(properties));
        var encoding = properties.defaultEncoding.slice();
        var baseEncodingName = properties.baseEncodingName;
        // Merge in the differences array.
        var differences = properties.differences;
        for (charcode in differences) {
          glyphName = differences[charcode];
          if (glyphName === '.notdef') {
            // Skip .notdef to prevent rendering errors, e.g. boxes appearing
            // where there should be spaces (fixes issue5256.pdf).
            continue;
          }
          encoding[charcode] = glyphName;
        }
        var glyphsUnicodeMap = getGlyphsUnicode();
        for (charcode in encoding) {
          // a) Map the character code to a character name.
          glyphName = encoding[charcode];
          // b) Look up the character name in the Adobe Glyph List (see the
          //    Bibliography) to obtain the corresponding Unicode value.
          if (glyphName === '') {
            continue;
          } else if (glyphsUnicodeMap[glyphName] === undefined) {
            // (undocumented) c) Few heuristics to recognize unknown glyphs
            // NOTE: Adobe Reader does not do this step, but OSX Preview does
            var code = 0;
            switch (glyphName[0]) {
              case 'G': // Gxx glyph
                if (glyphName.length === 3) {
                  code = parseInt(glyphName.substr(1), 16);
                }
                break;
              case 'g': // g00xx glyph
                if (glyphName.length === 5) {
                  code = parseInt(glyphName.substr(1), 16);
                }
                break;
              case 'C': // Cddd glyph
              case 'c': // cddd glyph
                if (glyphName.length >= 3) {
                  code = +glyphName.substr(1);
                }
                break;
              default:
                // 'uniXXXX'/'uXXXX{XX}' glyphs
                var unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
                if (unicode !== -1) {
                  code = unicode;
                }
            }
            if (code) {
              // If |baseEncodingName| is one the predefined encodings,
              // and |code| equals |charcode|, using the glyph defined in the
              // baseEncoding seems to yield a better |toUnicode| mapping
              // (fixes issue 5070).
              if (baseEncodingName && code === +charcode) {
                var baseEncoding = getEncoding(baseEncodingName);
                if (baseEncoding && (glyphName = baseEncoding[charcode])) {
                  toUnicode[charcode] =
                    String.fromCharCode(glyphsUnicodeMap[glyphName]);
                  continue;
                }
              }
              toUnicode[charcode] = String.fromCharCode(code);
            }
            continue;
          }
          toUnicode[charcode] =
            String.fromCharCode(glyphsUnicodeMap[glyphName]);
        }
        return Promise.resolve(new ToUnicodeMap(toUnicode));
      }
      // If the font is a composite font that uses one of the predefined CMaps
      // listed in Table 118 (except Identity–H and Identity–V) or whose
      // descendant CIDFont uses the Adobe-GB1, Adobe-CNS1, Adobe-Japan1, or
@ -2041,12 +2051,12 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        // b) Obtain the registry and ordering of the character collection used
        // by the font’s CMap (for example, Adobe and Japan1) from its
        // CIDSystemInfo dictionary.
-        var registry = properties.cidSystemInfo.registry;
+        let registry = properties.cidSystemInfo.registry;
-        var ordering = properties.cidSystemInfo.ordering;
+        let ordering = properties.cidSystemInfo.ordering;
        // c) Construct a second CMap name by concatenating the registry and
        // ordering obtained in step (b) in the format registry–ordering–UCS2
        // (for example, Adobe–Japan1–UCS2).
-        var ucs2CMapName = Name.get(registry + '-' + ordering + '-UCS2');
+        let ucs2CMapName = Name.get(registry + '-' + ordering + '-UCS2');
        // d) Obtain the CMap with the name constructed in step (c) (available
        // from the ASN Web site; see the Bibliography).
        return CMapFactory.create({
@ -2054,15 +2064,15 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
          fetchBuiltInCMap: this.fetchBuiltInCMap,
          useCMap: null,
        }).then(function (ucs2CMap) {
-          var cMap = properties.cMap;
+          let cMap = properties.cMap;
-          toUnicode = [];
+          let toUnicode = [];
          cMap.forEach(function(charcode, cid) {
            if (cid > 0xffff) {
              throw new FormatError('Max size of CID is 65,535');
            }
            // e) Map the CID obtained in step (a) according to the CMap
            // obtained in step (d), producing a Unicode value.
-            var ucs2 = ucs2CMap.lookup(cid);
+            let ucs2 = ucs2CMap.lookup(cid);
            if (ucs2) {
              toUnicode[charcode] =
                String.fromCharCode((ucs2.charCodeAt(0) << 8) +