Slightly re-factor PartialEvaluator._simpleFontToUnicode

Given the sheer number of heuristics added to this method over the years, moving the *valid* unicode found case to the top should improve readability of the code.
2022-10-05 12:52:22 +02:00 · 2022-10-05 12:52:22 +02:00 · fa47d4b9b1
commit fa47d4b9b1
parent c6cc7c6e6a
1 changed files with 67 additions and 67 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -3526,77 +3526,77 @@ class PartialEvaluator {
    for (const charcode in encoding) {
      // a) Map the character code to a character name.
      let glyphName = encoding[charcode];
      // b) Look up the character name in the Adobe Glyph List (see the
      //    Bibliography) to obtain the corresponding Unicode value.
      if (glyphName === "") {
        continue;
-      } else if (glyphsUnicodeMap[glyphName] === undefined) {
+      }
-        // (undocumented) c) Few heuristics to recognize unknown glyphs
+      // b) Look up the character name in the Adobe Glyph List (see the
-        // NOTE: Adobe Reader does not do this step, but OSX Preview does
+      //    Bibliography) to obtain the corresponding Unicode value.
-        let code = 0;
+      let unicode = glyphsUnicodeMap[glyphName];
-        switch (glyphName[0]) {
+      if (unicode !== undefined) {
-          case "G": // Gxx glyph
+        toUnicode[charcode] = String.fromCharCode(unicode);
            if (glyphName.length === 3) {
              code = parseInt(glyphName.substring(1), 16);
            }
            break;
          case "g": // g00xx glyph
            if (glyphName.length === 5) {
              code = parseInt(glyphName.substring(1), 16);
            }
            break;
          case "C": // Cdd{d} glyph
          case "c": // cdd{d} glyph
            if (glyphName.length >= 3 && glyphName.length <= 4) {
              const codeStr = glyphName.substring(1);
              if (forceGlyphs) {
                code = parseInt(codeStr, 16);
                break;
              }
              // Normally the Cdd{d}/cdd{d} glyphName format will contain
              // regular, i.e. base 10, charCodes (see issue4550.pdf)...
              code = +codeStr;
              // ... however some PDF generators violate that assumption by
              // containing glyph, i.e. base 16, codes instead.
              // In that case we need to re-parse the *entire* encoding to
              // prevent broken text-selection (fixes issue9655_reduced.pdf).
              if (
                Number.isNaN(code) &&
                Number.isInteger(parseInt(codeStr, 16))
              ) {
                return this._simpleFontToUnicode(
                  properties,
                  /* forceGlyphs */ true
                );
              }
            }
            break;
          default: // 'uniXXXX'/'uXXXX{XX}' glyphs
            const unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
            if (unicode !== -1) {
              code = unicode;
            }
        }
        if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) {
          // If `baseEncodingName` is one the predefined encodings, and `code`
          // equals `charcode`, using the glyph defined in the baseEncoding
          // seems to yield a better `toUnicode` mapping (fixes issue 5070).
          if (baseEncodingName && code === +charcode) {
            const baseEncoding = getEncoding(baseEncodingName);
            if (baseEncoding && (glyphName = baseEncoding[charcode])) {
              toUnicode[charcode] = String.fromCharCode(
                glyphsUnicodeMap[glyphName]
              );
              continue;
            }
          }
          toUnicode[charcode] = String.fromCodePoint(code);
        }
        continue;
      }
-      toUnicode[charcode] = String.fromCharCode(glyphsUnicodeMap[glyphName]);
+      // (undocumented) c) Few heuristics to recognize unknown glyphs
      // NOTE: Adobe Reader does not do this step, but OSX Preview does
      let code = 0;
      switch (glyphName[0]) {
        case "G": // Gxx glyph
          if (glyphName.length === 3) {
            code = parseInt(glyphName.substring(1), 16);
          }
          break;
        case "g": // g00xx glyph
          if (glyphName.length === 5) {
            code = parseInt(glyphName.substring(1), 16);
          }
          break;
        case "C": // Cdd{d} glyph
        case "c": // cdd{d} glyph
          if (glyphName.length >= 3 && glyphName.length <= 4) {
            const codeStr = glyphName.substring(1);
            if (forceGlyphs) {
              code = parseInt(codeStr, 16);
              break;
            }
            // Normally the Cdd{d}/cdd{d} glyphName format will contain
            // regular, i.e. base 10, charCodes (see issue4550.pdf)...
            code = +codeStr;
            // ... however some PDF generators violate that assumption by
            // containing glyph, i.e. base 16, codes instead.
            // In that case we need to re-parse the *entire* encoding to
            // prevent broken text-selection (fixes issue9655_reduced.pdf).
            if (Number.isNaN(code) && Number.isInteger(parseInt(codeStr, 16))) {
              return this._simpleFontToUnicode(
                properties,
                /* forceGlyphs */ true
              );
            }
          }
          break;
        case "u": // 'uniXXXX'/'uXXXX{XX}' glyphs
          unicode = getUnicodeForGlyph(glyphName, glyphsUnicodeMap);
          if (unicode !== -1) {
            code = unicode;
          }
          break;
      }
      if (code > 0 && code <= 0x10ffff && Number.isInteger(code)) {
        // If `baseEncodingName` is one the predefined encodings, and `code`
        // equals `charcode`, using the glyph defined in the baseEncoding
        // seems to yield a better `toUnicode` mapping (fixes issue 5070).
        if (baseEncodingName && code === +charcode) {
          const baseEncoding = getEncoding(baseEncodingName);
          if (baseEncoding && (glyphName = baseEncoding[charcode])) {
            toUnicode[charcode] = String.fromCharCode(
              glyphsUnicodeMap[glyphName]
            );
            continue;
          }
        }
        toUnicode[charcode] = String.fromCodePoint(code);
      }
    }
    return toUnicode;
  }