Attempt to improve the EI detection heuristics, for inline images, in streams containing NUL bytes (issue 8823)

Since this patch will now treat (some) `NUL` bytes as "ASCII", the number of `followingBytes` checked is thus increased to (hopefully) reduce the risk of introducing new false positives. Fixes 8823.
2017-08-26 12:09:49 +02:00 · 2017-08-26 12:09:49 +02:00 · 49b8cd5a6a
commit 49b8cd5a6a
parent 7cc7260634
4 changed files with 26 additions and 5 deletions
--- a/src/core/parser.js
+++ b/src/core/parser.js
@ -148,7 +148,8 @@ var Parser = (function ParserClosure() {
     * @returns {number} The inline stream length.
     */
    findDefaultInlineStreamEnd(stream) {
-      const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD, n = 5;
+      const E = 0x45, I = 0x49, SPACE = 0x20, LF = 0xA, CR = 0xD;
      const n = 10, NUL = 0x0;
      let startPos = stream.pos, state = 0, ch, maybeEIPos;
      while ((ch = stream.getByte()) !== -1) {
        if (state === 0) {
@ -159,10 +160,23 @@ var Parser = (function ParserClosure() {
          assert(state === 2);
          if (ch === SPACE || ch === LF || ch === CR) {
            maybeEIPos = stream.pos;
-            // Let's check the next `n` bytes are ASCII... just be sure.
+            // Let's check that the next `n` bytes are ASCII... just to be sure.
            let followingBytes = stream.peekBytes(n);
-            for (let i = 0; i < n; i++) {
+            for (let i = 0, ii = followingBytes.length; i < ii; i++) {
              ch = followingBytes[i];
              if (ch === NUL && followingBytes[i + 1] !== NUL) {
                // NUL bytes are not supposed to occur *outside* of inline
                // images, but some PDF generators violate that assumption,
                // thus breaking the EI detection heuristics used below.
                //
                // However, we can't unconditionally treat NUL bytes as "ASCII",
                // since that *could* result in inline images being truncated.
                //
                // To attempt to address this, we'll still treat any *sequence*
                // of NUL bytes as non-ASCII, but for a *single* NUL byte we'll
                // continue checking the `followingBytes` (fixes issue8823.pdf).
                continue;
              }
              if (ch !== LF && ch !== CR && (ch < SPACE || ch > 0x7F)) {
                // Not a LF, CR, SPACE or any visible ASCII character, i.e.
                // it's binary stuff. Resetting the state.
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@ -58,6 +58,7 @@
 !issue8697.pdf
 !issue8707.pdf
 !issue8798r.pdf
 !issue8823.pdf
 !bad-PageLabels.pdf
 !filled-background.pdf
 !ArabicCIDTrueType.pdf
--- a/test/pdfs/issue8823.pdf
+++ b/test/pdfs/issue8823.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -2976,9 +2976,15 @@
    {  "id": "issue8798",
       "file": "pdfs/issue8798r.pdf",
       "md5": "3a0e29f013d9edcceb5d852e37738a77",
       "link": false,
       "rounds": 1,
       "type": "eq"
    },
    {  "id": "issue8823",
       "file": "pdfs/issue8823.pdf",
       "md5": "ad02d4aa374b315bf1766038d002d57a",
       "link": false,
       "rounds": 1,
       "lastPage": 1,
       "type": "eq"
    },
    {  "id": "issue8613",