Merge pull request #7066 from Snuffleupagus/Type1-headerBlockLength

Parse Type1 font files to determine the various `Length{n}` properties, instead of trusting the PDF file (issue 5686, issue 3928)
2016-03-31 09:25:16 -07:00 · 2016-03-31 09:25:16 -07:00 · 8910cea7d6
commit 8910cea7d6
parent 447c48ea27 05cf709f8e
6 changed files with 355 additions and 213 deletions
--- a/src/core/evaluator.js
+++ b/src/core/evaluator.js
@ -2197,6 +2197,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
          }
          var length1 = fontFile.dict.get('Length1');
          var length2 = fontFile.dict.get('Length2');
+          var length3 = fontFile.dict.get('Length3');
        }
      }

@ -2207,6 +2208,7 @@ var PartialEvaluator = (function PartialEvaluatorClosure() {
        file: fontFile,
        length1: length1,
        length2: length2,
+        length3: length3,
        loadedName: baseDict.loadedName,
        composite: composite,
        wideChars: composite,
--- a/src/core/fonts.js
+++ b/src/core/fonts.js
@ -54,6 +54,7 @@ var shadow = sharedUtil.shadow;
 var stringToBytes = sharedUtil.stringToBytes;
 var string32 = sharedUtil.string32;
 var warn = sharedUtil.warn;
+var MissingDataException = sharedUtil.MissingDataException;
 var Stream = coreStream.Stream;
 var Lexer = coreParser.Lexer;
 var getGlyphsUnicode = coreGlyphList.getGlyphsUnicode;
@ -3637,231 +3638,352 @@ var CFFStandardStrings = [
 ];

 // Type1Font is also a CIDFontType0.
-var Type1Font = function Type1Font(name, file, properties) {
-  // Some bad generators embed pfb file as is, we have to strip 6-byte headers.
-  // Also, length1 and length2 might be off by 6 bytes as well.
-  // http://www.math.ubc.ca/~cass/piscript/type1.pdf
-  var PFB_HEADER_SIZE = 6;
-  var headerBlockLength = properties.length1;
-  var eexecBlockLength = properties.length2;
-  var pfbHeader = file.peekBytes(PFB_HEADER_SIZE);
-  var pfbHeaderPresent = pfbHeader[0] === 0x80 && pfbHeader[1] === 0x01;
-  if (pfbHeaderPresent) {
-    file.skip(PFB_HEADER_SIZE);
-    headerBlockLength = (pfbHeader[5] << 24) | (pfbHeader[4] << 16) |
-                        (pfbHeader[3] << 8) | pfbHeader[2];
-  }
+var Type1Font = (function Type1FontClosure() {
+  function findBlock(streamBytes, signature, startIndex) {
+    var streamBytesLength = streamBytes.length;
+    var signatureLength = signature.length;
+    var scanLength = streamBytesLength - signatureLength;

-  // Get the data block containing glyphs and subrs informations
-  var headerBlock = new Stream(file.getBytes(headerBlockLength));
-  var headerBlockParser = new Type1Parser(headerBlock);
-  headerBlockParser.extractFontHeader(properties);
-
-  if (pfbHeaderPresent) {
-    pfbHeader = file.getBytes(PFB_HEADER_SIZE);
-    eexecBlockLength = (pfbHeader[5] << 24) | (pfbHeader[4] << 16) |
-                       (pfbHeader[3] << 8) | pfbHeader[2];
-  }
-
-  // Decrypt the data blocks and retrieve it's content
-  var eexecBlock = new Stream(file.getBytes(eexecBlockLength));
-  var eexecBlockParser = new Type1Parser(eexecBlock, true);
-  var data = eexecBlockParser.extractFontProgram();
-  for (var info in data.properties) {
-    properties[info] = data.properties[info];
-  }
-
-  var charstrings = data.charstrings;
-  var type2Charstrings = this.getType2Charstrings(charstrings);
-  var subrs = this.getType2Subrs(data.subrs);
-
-  this.charstrings = charstrings;
-  this.data = this.wrap(name, type2Charstrings, this.charstrings,
-                        subrs, properties);
-  this.seacs = this.getSeacs(data.charstrings);
-};
-
-Type1Font.prototype = {
-  get numGlyphs() {
-    return this.charstrings.length + 1;
-  },
-
-  getCharset: function Type1Font_getCharset() {
-    var charset = ['.notdef'];
-    var charstrings = this.charstrings;
-    for (var glyphId = 0; glyphId < charstrings.length; glyphId++) {
-      charset.push(charstrings[glyphId].glyphName);
+    var i = startIndex, j, found = false;
+    while (i < scanLength) {
+      j = 0;
+      while (j < signatureLength && streamBytes[i + j] === signature[j]) {
+        j++;
+      }
+      if (j >= signatureLength) { // `signature` found, skip over whitespace.
+        i += j;
+        while (i < streamBytesLength && Lexer.isSpace(streamBytes[i])) {
+          i++;
+        }
+        found = true;
+        break;
+      }
+      i++;
    }
-    return charset;
-  },
+    return {
+      found: found,
+      length: i,
+    };
+  }

-  getGlyphMapping: function Type1Font_getGlyphMapping(properties) {
-    var charstrings = this.charstrings;
-    var glyphNames = ['.notdef'], glyphId;
-    for (glyphId = 0; glyphId < charstrings.length; glyphId++) {
-      glyphNames.push(charstrings[glyphId].glyphName);
+  function getHeaderBlock(stream, suggestedLength) {
+    var EEXEC_SIGNATURE = [0x65, 0x65, 0x78, 0x65, 0x63];
+
+    var streamStartPos = stream.pos; // Save the initial stream position.
+    var headerBytes, headerBytesLength, block;
+    try {
+      headerBytes = stream.getBytes(suggestedLength);
+      headerBytesLength = headerBytes.length;
+    } catch (ex) {
+      if (ex instanceof MissingDataException) {
+        throw ex;
+      }
+      // Ignore errors if the `suggestedLength` is huge enough that a Uint8Array
+      // cannot hold the result of `getBytes`, and fallback to simply checking
+      // the entire stream (fixes issue3928.pdf).
    }
-    var encoding = properties.builtInEncoding;
-    if (encoding) {
-      var builtInEncoding = Object.create(null);
-      for (var charCode in encoding) {
-        glyphId = glyphNames.indexOf(encoding[charCode]);
-        if (glyphId >= 0) {
-          builtInEncoding[charCode] = glyphId;
+
+    if (headerBytesLength === suggestedLength) {
+      // Most of the time `suggestedLength` is correct, so to speed things up we
+      // initially only check the last few bytes to see if the header was found.
+      // Otherwise we (potentially) check the entire stream to prevent errors in
+      // `Type1Parser` (fixes issue5686.pdf).
+      block = findBlock(headerBytes, EEXEC_SIGNATURE,
+                        suggestedLength - 2 * EEXEC_SIGNATURE.length);
+
+      if (block.found && block.length === suggestedLength) {
+        return {
+          stream: new Stream(headerBytes),
+          length: suggestedLength,
+        };
+      }
+    }
+    warn('Invalid "Length1" property in Type1 font -- trying to recover.');
+    stream.pos = streamStartPos; // Reset the stream position.
+
+    var SCAN_BLOCK_LENGTH = 2048;
+    var actualLength;
+    while (true) {
+      var scanBytes = stream.peekBytes(SCAN_BLOCK_LENGTH);
+      block = findBlock(scanBytes, EEXEC_SIGNATURE, 0);
+
+      if (block.length === 0) {
+        break;
+      }
+      stream.pos += block.length; // Update the stream position.
+
+      if (block.found) {
+        actualLength = stream.pos - streamStartPos;
+        break;
+      }
+    }
+    stream.pos = streamStartPos; // Reset the stream position.
+
+    if (actualLength) {
+      return {
+        stream: new Stream(stream.getBytes(actualLength)),
+        length: actualLength,
+      };
+    }
+    warn('Unable to recover "Length1" property in Type1 font -- using as is.');
+    return {
+      stream: new Stream(stream.getBytes(suggestedLength)),
+      length: suggestedLength,
+    };
+  }
+
+  function getEexecBlock(stream, suggestedLength) {
+    // We should ideally parse the eexec block to ensure that `suggestedLength`
+    // is correct, so we don't truncate the block data if it's too small.
+    // However, this would also require checking if the fixed-content portion
+    // exists (using the 'Length3' property), and ensuring that it's valid.
+    //
+    // Given that `suggestedLength` almost always is correct, all the validation
+    // would require a great deal of unnecessary parsing for most fonts.
+    // To save time, we always fetch the entire stream instead, which also avoid
+    // issues if `suggestedLength` is huge (see comment in `getHeaderBlock`).
+    //
+    // NOTE: This means that the function can include the fixed-content portion
+    // in the returned eexec block. In practice this does *not* seem to matter,
+    // since `Type1Parser_extractFontProgram` will skip over any non-commands.
+    var eexecBytes = stream.getBytes();
+    return {
+      stream: new Stream(eexecBytes),
+      length: eexecBytes.length,
+    };
+  }
+
+  function Type1Font(name, file, properties) {
+    // Some bad generators embed pfb file as is, we have to strip 6-byte header.
+    // Also, length1 and length2 might be off by 6 bytes as well.
+    // http://www.math.ubc.ca/~cass/piscript/type1.pdf
+    var PFB_HEADER_SIZE = 6;
+    var headerBlockLength = properties.length1;
+    var eexecBlockLength = properties.length2;
+    var pfbHeader = file.peekBytes(PFB_HEADER_SIZE);
+    var pfbHeaderPresent = pfbHeader[0] === 0x80 && pfbHeader[1] === 0x01;
+    if (pfbHeaderPresent) {
+      file.skip(PFB_HEADER_SIZE);
+      headerBlockLength = (pfbHeader[5] << 24) | (pfbHeader[4] << 16) |
+                          (pfbHeader[3] << 8) | pfbHeader[2];
+    }
+
+    // Get the data block containing glyphs and subrs informations
+    var headerBlock = getHeaderBlock(file, headerBlockLength);
+    headerBlockLength = headerBlock.length;
+    var headerBlockParser = new Type1Parser(headerBlock.stream);
+    headerBlockParser.extractFontHeader(properties);
+
+    if (pfbHeaderPresent) {
+      pfbHeader = file.getBytes(PFB_HEADER_SIZE);
+      eexecBlockLength = (pfbHeader[5] << 24) | (pfbHeader[4] << 16) |
+                         (pfbHeader[3] << 8) | pfbHeader[2];
+    }
+
+    // Decrypt the data blocks and retrieve it's content
+    var eexecBlock = getEexecBlock(file, eexecBlockLength);
+    eexecBlockLength = eexecBlock.length;
+    var eexecBlockParser = new Type1Parser(eexecBlock.stream, true);
+    var data = eexecBlockParser.extractFontProgram();
+    for (var info in data.properties) {
+      properties[info] = data.properties[info];
+    }
+
+    var charstrings = data.charstrings;
+    var type2Charstrings = this.getType2Charstrings(charstrings);
+    var subrs = this.getType2Subrs(data.subrs);
+
+    this.charstrings = charstrings;
+    this.data = this.wrap(name, type2Charstrings, this.charstrings,
+                          subrs, properties);
+    this.seacs = this.getSeacs(data.charstrings);
+  }
+
+  Type1Font.prototype = {
+    get numGlyphs() {
+      return this.charstrings.length + 1;
+    },
+
+    getCharset: function Type1Font_getCharset() {
+      var charset = ['.notdef'];
+      var charstrings = this.charstrings;
+      for (var glyphId = 0; glyphId < charstrings.length; glyphId++) {
+        charset.push(charstrings[glyphId].glyphName);
+      }
+      return charset;
+    },
+
+    getGlyphMapping: function Type1Font_getGlyphMapping(properties) {
+      var charstrings = this.charstrings;
+      var glyphNames = ['.notdef'], glyphId;
+      for (glyphId = 0; glyphId < charstrings.length; glyphId++) {
+        glyphNames.push(charstrings[glyphId].glyphName);
+      }
+      var encoding = properties.builtInEncoding;
+      if (encoding) {
+        var builtInEncoding = Object.create(null);
+        for (var charCode in encoding) {
+          glyphId = glyphNames.indexOf(encoding[charCode]);
+          if (glyphId >= 0) {
+            builtInEncoding[charCode] = glyphId;
+          }
        }
      }
-    }

-    return type1FontGlyphMapping(properties, builtInEncoding, glyphNames);
-  },
+      return type1FontGlyphMapping(properties, builtInEncoding, glyphNames);
+    },

-  getSeacs: function Type1Font_getSeacs(charstrings) {
-    var i, ii;
-    var seacMap = [];
-    for (i = 0, ii = charstrings.length; i < ii; i++) {
-      var charstring = charstrings[i];
-      if (charstring.seac) {
-        // Offset by 1 for .notdef
-        seacMap[i + 1] = charstring.seac;
-      }
-    }
-    return seacMap;
-  },
-
-  getType2Charstrings: function Type1Font_getType2Charstrings(
-                                  type1Charstrings) {
-    var type2Charstrings = [];
-    for (var i = 0, ii = type1Charstrings.length; i < ii; i++) {
-      type2Charstrings.push(type1Charstrings[i].charstring);
-    }
-    return type2Charstrings;
-  },
-
-  getType2Subrs: function Type1Font_getType2Subrs(type1Subrs) {
-    var bias = 0;
-    var count = type1Subrs.length;
-    if (count < 1133) {
-      bias = 107;
-    } else if (count < 33769) {
-      bias = 1131;
-    } else {
-      bias = 32768;
-    }
-
-    // Add a bunch of empty subrs to deal with the Type2 bias
-    var type2Subrs = [];
-    var i;
-    for (i = 0; i < bias; i++) {
-      type2Subrs.push([0x0B]);
-    }
-
-    for (i = 0; i < count; i++) {
-      type2Subrs.push(type1Subrs[i]);
-    }
-
-    return type2Subrs;
-  },
-
-  wrap: function Type1Font_wrap(name, glyphs, charstrings, subrs, properties) {
-    var cff = new CFF();
-    cff.header = new CFFHeader(1, 0, 4, 4);
-
-    cff.names = [name];
-
-    var topDict = new CFFTopDict();
-    // CFF strings IDs 0...390 are predefined names, so refering
-    // to entries in our own String INDEX starts at SID 391.
-    topDict.setByName('version', 391);
-    topDict.setByName('Notice', 392);
-    topDict.setByName('FullName', 393);
-    topDict.setByName('FamilyName', 394);
-    topDict.setByName('Weight', 395);
-    topDict.setByName('Encoding', null); // placeholder
-    topDict.setByName('FontMatrix', properties.fontMatrix);
-    topDict.setByName('FontBBox', properties.bbox);
-    topDict.setByName('charset', null); // placeholder
-    topDict.setByName('CharStrings', null); // placeholder
-    topDict.setByName('Private', null); // placeholder
-    cff.topDict = topDict;
-
-    var strings = new CFFStrings();
-    strings.add('Version 0.11'); // Version
-    strings.add('See original notice'); // Notice
-    strings.add(name); // FullName
-    strings.add(name); // FamilyName
-    strings.add('Medium'); // Weight
-    cff.strings = strings;
-
-    cff.globalSubrIndex = new CFFIndex();
-
-    var count = glyphs.length;
-    var charsetArray = [0];
-    var i, ii;
-    for (i = 0; i < count; i++) {
-      var index = CFFStandardStrings.indexOf(charstrings[i].glyphName);
-      // TODO: Insert the string and correctly map it.  Previously it was
-      // thought mapping names that aren't in the standard strings to .notdef
-      // was fine, however in issue818 when mapping them all to .notdef the
-      // adieresis glyph no longer worked.
-      if (index === -1) {
-        index = 0;
-      }
-      charsetArray.push((index >> 8) & 0xff, index & 0xff);
-    }
-    cff.charset = new CFFCharset(false, 0, [], charsetArray);
-
-    var charStringsIndex = new CFFIndex();
-    charStringsIndex.add([0x8B, 0x0E]); // .notdef
-    for (i = 0; i < count; i++) {
-      charStringsIndex.add(glyphs[i]);
-    }
-    cff.charStrings = charStringsIndex;
-
-    var privateDict = new CFFPrivateDict();
-    privateDict.setByName('Subrs', null); // placeholder
-    var fields = [
-      'BlueValues',
-      'OtherBlues',
-      'FamilyBlues',
-      'FamilyOtherBlues',
-      'StemSnapH',
-      'StemSnapV',
-      'BlueShift',
-      'BlueFuzz',
-      'BlueScale',
-      'LanguageGroup',
-      'ExpansionFactor',
-      'ForceBold',
-      'StdHW',
-      'StdVW'
-    ];
-    for (i = 0, ii = fields.length; i < ii; i++) {
-      var field = fields[i];
-      if (!(field in properties.privateData)) {
-        continue;
-      }
-      var value = properties.privateData[field];
-      if (isArray(value)) {
-        // All of the private dictionary array data in CFF must be stored as
-        // "delta-encoded" numbers.
-        for (var j = value.length - 1; j > 0; j--) {
-          value[j] -= value[j - 1]; // ... difference from previous value
+    getSeacs: function Type1Font_getSeacs(charstrings) {
+      var i, ii;
+      var seacMap = [];
+      for (i = 0, ii = charstrings.length; i < ii; i++) {
+        var charstring = charstrings[i];
+        if (charstring.seac) {
+          // Offset by 1 for .notdef
+          seacMap[i + 1] = charstring.seac;
        }
      }
-      privateDict.setByName(field, value);
-    }
-    cff.topDict.privateDict = privateDict;
+      return seacMap;
+    },

-    var subrIndex = new CFFIndex();
-    for (i = 0, ii = subrs.length; i < ii; i++) {
-      subrIndex.add(subrs[i]);
-    }
-    privateDict.subrsIndex = subrIndex;
+    getType2Charstrings: function Type1Font_getType2Charstrings(
+                                    type1Charstrings) {
+      var type2Charstrings = [];
+      for (var i = 0, ii = type1Charstrings.length; i < ii; i++) {
+        type2Charstrings.push(type1Charstrings[i].charstring);
+      }
+      return type2Charstrings;
+    },

-    var compiler = new CFFCompiler(cff);
-    return compiler.compile();
-  }
-};
+    getType2Subrs: function Type1Font_getType2Subrs(type1Subrs) {
+      var bias = 0;
+      var count = type1Subrs.length;
+      if (count < 1133) {
+        bias = 107;
+      } else if (count < 33769) {
+        bias = 1131;
+      } else {
+        bias = 32768;
+      }
+
+      // Add a bunch of empty subrs to deal with the Type2 bias
+      var type2Subrs = [];
+      var i;
+      for (i = 0; i < bias; i++) {
+        type2Subrs.push([0x0B]);
+      }
+
+      for (i = 0; i < count; i++) {
+        type2Subrs.push(type1Subrs[i]);
+      }
+
+      return type2Subrs;
+    },
+
+    wrap: function Type1Font_wrap(name, glyphs, charstrings, subrs,
+                                  properties) {
+      var cff = new CFF();
+      cff.header = new CFFHeader(1, 0, 4, 4);
+
+      cff.names = [name];
+
+      var topDict = new CFFTopDict();
+      // CFF strings IDs 0...390 are predefined names, so refering
+      // to entries in our own String INDEX starts at SID 391.
+      topDict.setByName('version', 391);
+      topDict.setByName('Notice', 392);
+      topDict.setByName('FullName', 393);
+      topDict.setByName('FamilyName', 394);
+      topDict.setByName('Weight', 395);
+      topDict.setByName('Encoding', null); // placeholder
+      topDict.setByName('FontMatrix', properties.fontMatrix);
+      topDict.setByName('FontBBox', properties.bbox);
+      topDict.setByName('charset', null); // placeholder
+      topDict.setByName('CharStrings', null); // placeholder
+      topDict.setByName('Private', null); // placeholder
+      cff.topDict = topDict;
+
+      var strings = new CFFStrings();
+      strings.add('Version 0.11'); // Version
+      strings.add('See original notice'); // Notice
+      strings.add(name); // FullName
+      strings.add(name); // FamilyName
+      strings.add('Medium'); // Weight
+      cff.strings = strings;
+
+      cff.globalSubrIndex = new CFFIndex();
+
+      var count = glyphs.length;
+      var charsetArray = [0];
+      var i, ii;
+      for (i = 0; i < count; i++) {
+        var index = CFFStandardStrings.indexOf(charstrings[i].glyphName);
+        // TODO: Insert the string and correctly map it.  Previously it was
+        // thought mapping names that aren't in the standard strings to .notdef
+        // was fine, however in issue818 when mapping them all to .notdef the
+        // adieresis glyph no longer worked.
+        if (index === -1) {
+          index = 0;
+        }
+        charsetArray.push((index >> 8) & 0xff, index & 0xff);
+      }
+      cff.charset = new CFFCharset(false, 0, [], charsetArray);
+
+      var charStringsIndex = new CFFIndex();
+      charStringsIndex.add([0x8B, 0x0E]); // .notdef
+      for (i = 0; i < count; i++) {
+        charStringsIndex.add(glyphs[i]);
+      }
+      cff.charStrings = charStringsIndex;
+
+      var privateDict = new CFFPrivateDict();
+      privateDict.setByName('Subrs', null); // placeholder
+      var fields = [
+        'BlueValues',
+        'OtherBlues',
+        'FamilyBlues',
+        'FamilyOtherBlues',
+        'StemSnapH',
+        'StemSnapV',
+        'BlueShift',
+        'BlueFuzz',
+        'BlueScale',
+        'LanguageGroup',
+        'ExpansionFactor',
+        'ForceBold',
+        'StdHW',
+        'StdVW'
+      ];
+      for (i = 0, ii = fields.length; i < ii; i++) {
+        var field = fields[i];
+        if (!(field in properties.privateData)) {
+          continue;
+        }
+        var value = properties.privateData[field];
+        if (isArray(value)) {
+          // All of the private dictionary array data in CFF must be stored as
+          // "delta-encoded" numbers.
+          for (var j = value.length - 1; j > 0; j--) {
+            value[j] -= value[j - 1]; // ... difference from previous value
+          }
+        }
+        privateDict.setByName(field, value);
+      }
+      cff.topDict.privateDict = privateDict;
+
+      var subrIndex = new CFFIndex();
+      for (i = 0, ii = subrs.length; i < ii; i++) {
+        subrIndex.add(subrs[i]);
+      }
+      privateDict.subrsIndex = subrIndex;
+
+      var compiler = new CFFCompiler(cff);
+      return compiler.compile();
+    }
+  };
+
+  return Type1Font;
+})();

 var CFFFont = (function CFFFontClosure() {
  function CFFFont(file, properties) {
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -49,6 +49,8 @@
 !issue3207r.pdf
 !issue3263r.pdf
 !issue3879r.pdf
+!issue5686.pdf
+!issue3928.pdf
 !close-path-bug.pdf
 !issue6019.pdf
 !issue6621.pdf
--- a/test/pdfs/issue3928.pdf
+++ b/test/pdfs/issue3928.pdf
--- a/test/pdfs/issue5686.pdf
+++ b/test/pdfs/issue5686.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -1443,6 +1443,22 @@
      "link": false,
      "type": "eq"
    },
+    {  "id": "issue5686",
+       "file": "pdfs/issue5686.pdf",
+       "md5": "78d16b9df07a355ad00d70504a9194f8",
+       "rounds": 1,
+       "link": false,
+       "type": "eq",
+       "about": "Type1 font where Length1/Length2 are slightly incorrect."
+    },
+    {  "id": "issue3928",
+       "file": "pdfs/issue3928.pdf",
+       "md5": "1963493f843e981cbe768b707ef7f08a",
+       "rounds": 1,
+       "link": false,
+       "type": "eq",
+       "about": "Type1 font where Length1/Length2 are several orders of magnitude too large."
+    },
    {  "id": "html5checker",
      "file": "pdfs/html5checker.pdf",
      "md5": "74bbd80d1e7eb5f2951582233ef9ebab",