Improve "EI" detection in inline images (PR 12028 follow-up, issue 16454)
Given that inline images may contain "EI" sequences in the image data itself, actually finding the end-of-image operator isn't always straightforward. Here we extend the implementation from PR 12028 to potentially check all of the following bytes, rather than stopping immediately. While we have fairly decent test coverage for this code, changing it unfortunately carries a slightly higher than normal risk of regressions. (You'd really wish that PDF generators would just stop using inline images.)
Parent: 094fb3c783
Commit: dfbbb8c0ac
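The core of the change: when a candidate "EI" (followed by whitespace) has been found, the bytes after it are parsed with a temporary Lexer, and the "EI" is only accepted as the end-of-image operator if those bytes form a known command with a plausible number of arguments. Below is a minimal sketch of that check, condensed from the diff that follows; the helper name `looksLikeValidCommand` is purely illustrative, since in the actual patch this logic is inlined in `Parser.findDefaultInlineStreamEnd`.

import { Cmd, EOF } from "./primitives.js";
import { Lexer } from "./parser.js";
import { Stream } from "./stream.js";

// Returns true if the bytes following a candidate "EI" parse as a known
// content-stream command (optionally preceded by its operands), and false
// if they look like more inline-image data.
function looksLikeValidCommand(followingBytes, knownCommands) {
  const tmpLexer = new Lexer(new Stream(followingBytes.slice()), knownCommands);
  let numArgs = 0;

  while (true) {
    const nextObj = tmpLexer.getObj();

    if (nextObj === EOF) {
      return false; // No valid command found in the peeked bytes.
    }
    if (nextObj instanceof Cmd) {
      const knownCommand = knownCommands[nextObj.cmd];
      if (!knownCommand) {
        return false; // Unknown command, i.e. "EI" was part of the image data.
      }
      if (
        knownCommand.variableArgs
          ? numArgs <= knownCommand.numArgs
          : numArgs === knownCommand.numArgs
      ) {
        return true; // A valid command, with a plausible argument count.
      }
      numArgs = 0; // Wrong argument count; keep scanning the remaining bytes.
      continue;
    }
    numArgs++; // Count operands preceding the next command.
  }
}

When this check fails, the state machine in `findDefaultInlineStreamEnd` resets (`state = 0`) and keeps scanning for the next "EI" candidate.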
src/core/parser.js

@@ -26,6 +26,7 @@ import {
   MissingDataException,
   ParserEOFException,
 } from "./core_utils.js";
+import { NullStream, Stream } from "./stream.js";
 import { Ascii85Stream } from "./ascii_85_stream.js";
 import { AsciiHexStream } from "./ascii_hex_stream.js";
 import { CCITTFaxStream } from "./ccitt_stream.js";
@@ -34,7 +35,6 @@ import { Jbig2Stream } from "./jbig2_stream.js";
 import { JpegStream } from "./jpeg_stream.js";
 import { JpxStream } from "./jpx_stream.js";
 import { LZWStream } from "./lzw_stream.js";
-import { NullStream } from "./stream.js";
 import { PredictorStream } from "./predictor_stream.js";
 import { RunLengthStream } from "./run_length_stream.js";
 
@@ -190,9 +190,9 @@ class Parser {
       LF = 0xa,
       CR = 0xd,
       NUL = 0x0;
-    const lexer = this.lexer,
+    const { knownCommands } = this.lexer,
       startPos = stream.pos,
-      n = 10;
+      n = 15;
     let state = 0,
       ch,
       maybeEIPos;
@@ -209,7 +209,12 @@
           maybeEIPos = stream.pos;
           // Let's check that the next `n` bytes are ASCII... just to be sure.
           const followingBytes = stream.peekBytes(n);
-          for (let i = 0, ii = followingBytes.length; i < ii; i++) {
+
+          const ii = followingBytes.length;
+          if (ii === 0) {
+            break; // The end of the stream was reached, nothing to check.
+          }
+          for (let i = 0; i < ii; i++) {
             ch = followingBytes[i];
             if (ch === NUL && followingBytes[i + 1] !== NUL) {
               // NUL bytes are not supposed to occur *outside* of inline
@@ -235,19 +240,47 @@
           if (state !== 2) {
             continue;
           }
-          // Check that the "EI" sequence isn't part of the image data, since
-          // that would cause the image to be truncated (fixes issue11124.pdf).
-          if (lexer.knownCommands) {
-            const nextObj = lexer.peekObj();
-            if (nextObj instanceof Cmd && !lexer.knownCommands[nextObj.cmd]) {
-              // Not a valid command, i.e. the inline image data *itself*
-              // contains an "EI" sequence. Resetting the state.
-              state = 0;
-            }
-          } else {
+          if (!knownCommands) {
             warn(
               "findDefaultInlineStreamEnd - `lexer.knownCommands` is undefined."
             );
+            continue;
           }
+          // Check that the "EI" sequence isn't part of the image data, since
+          // that would cause the image to be truncated (fixes issue11124.pdf).
+          const tmpLexer = new Lexer(
+            new Stream(followingBytes.slice()),
+            knownCommands
+          );
+          // Reduce the number of (potential) warning messages.
+          tmpLexer._hexStringWarn = () => {};
+          let numArgs = 0;
+
+          while (true) {
+            const nextObj = tmpLexer.getObj();
+
+            if (nextObj === EOF) {
+              state = 0; // No valid command found, resetting the state.
+              break;
+            }
+            if (nextObj instanceof Cmd) {
+              const knownCommand = knownCommands[nextObj.cmd];
+              if (!knownCommand) {
+                // Not a valid command, i.e. the inline image data *itself*
+                // contains an "EI" sequence. Resetting the state.
+                state = 0;
+                break;
+              } else if (
+                knownCommand.variableArgs
+                  ? numArgs <= knownCommand.numArgs
+                  : numArgs === knownCommand.numArgs
+              ) {
+                break; // Valid command found.
+              }
+              numArgs = 0;
+              continue;
+            }
+            numArgs++;
+          }
 
           if (state === 2) {
@@ -1284,28 +1317,6 @@ class Lexer {
     return Cmd.get(str);
   }
 
-  peekObj() {
-    const streamPos = this.stream.pos,
-      currentChar = this.currentChar,
-      beginInlineImagePos = this.beginInlineImagePos;
-
-    let nextObj;
-    try {
-      nextObj = this.getObj();
-    } catch (ex) {
-      if (ex instanceof MissingDataException) {
-        throw ex;
-      }
-      warn(`peekObj: ${ex}`);
-    }
-    // Ensure that we reset *all* relevant `Lexer`-instance state.
-    this.stream.pos = streamPos;
-    this.currentChar = currentChar;
-    this.beginInlineImagePos = beginInlineImagePos;
-
-    return nextObj;
-  }
-
   skipToNextLine() {
     let ch = this.currentChar;
     while (ch >= 0) {
test/pdfs/issue16454.pdf.link (new file)

@@ -0,0 +1 @@
+https://github.com/mozilla/pdf.js/files/11537582/Pages.62.73.from.0560-22_WSP.Plan_July.2022_Version.1.pdf
test/test_manifest.json

@@ -5963,6 +5963,14 @@
       "link": true,
       "type": "eq"
     },
+    { "id": "issue16454",
+      "file": "pdfs/issue16454.pdf",
+      "md5": "82fe0c54a96667472ce999be7a789199",
+      "rounds": 1,
+      "lastPage": 1,
+      "link": true,
+      "type": "eq"
+    },
     { "id": "decodeACSuccessive",
       "file": "pdfs/decodeACSuccessive.pdf",
       "md5": "7749c032624fe27ab8e8d7d5e9a4a93f",