From dfbbb8c0ac4a9d5c56a537a0ce376efde218270c Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Tue, 23 May 2023 13:08:02 +0200 Subject: [PATCH] Improve "EI" detection in inline images (PR 12028 follow-up, issue 16454) Given that inline images may contain "EI"-sequences in the image-data itself, actually finding the end-of-image operator isn't always straightforward. Here we extend the implementation from PR 12028 to potentially check all of the following bytes, rather than stopping immediately. While we have fairly decent test-coverage for this code, whenever you're changing it there's unfortunately a slightly higher than normal risk of regressions. (You'd really wish that PDF generators just stop using inline images.) --- src/core/parser.js | 83 ++++++++++++++++++++--------------- test/pdfs/issue16454.pdf.link | 1 + test/test_manifest.json | 8 ++++ 3 files changed, 56 insertions(+), 36 deletions(-) create mode 100644 test/pdfs/issue16454.pdf.link diff --git a/src/core/parser.js b/src/core/parser.js index 371eed242..8a0c92e58 100644 --- a/src/core/parser.js +++ b/src/core/parser.js @@ -26,6 +26,7 @@ import { MissingDataException, ParserEOFException, } from "./core_utils.js"; +import { NullStream, Stream } from "./stream.js"; import { Ascii85Stream } from "./ascii_85_stream.js"; import { AsciiHexStream } from "./ascii_hex_stream.js"; import { CCITTFaxStream } from "./ccitt_stream.js"; @@ -34,7 +35,6 @@ import { Jbig2Stream } from "./jbig2_stream.js"; import { JpegStream } from "./jpeg_stream.js"; import { JpxStream } from "./jpx_stream.js"; import { LZWStream } from "./lzw_stream.js"; -import { NullStream } from "./stream.js"; import { PredictorStream } from "./predictor_stream.js"; import { RunLengthStream } from "./run_length_stream.js"; @@ -190,9 +190,9 @@ class Parser { LF = 0xa, CR = 0xd, NUL = 0x0; - const lexer = this.lexer, + const { knownCommands } = this.lexer, startPos = stream.pos, - n = 10; + n = 15; let state = 0, ch, maybeEIPos; @@ -209,7 +209,12 @@ class Parser { maybeEIPos = stream.pos; // Let's check that the next `n` bytes are ASCII... just to be sure. const followingBytes = stream.peekBytes(n); - for (let i = 0, ii = followingBytes.length; i < ii; i++) { + + const ii = followingBytes.length; + if (ii === 0) { + break; // The end of the stream was reached, nothing to check. + } + for (let i = 0; i < ii; i++) { ch = followingBytes[i]; if (ch === NUL && followingBytes[i + 1] !== NUL) { // NUL bytes are not supposed to occur *outside* of inline @@ -235,19 +240,47 @@ class Parser { if (state !== 2) { continue; } - // Check that the "EI" sequence isn't part of the image data, since - // that would cause the image to be truncated (fixes issue11124.pdf). - if (lexer.knownCommands) { - const nextObj = lexer.peekObj(); - if (nextObj instanceof Cmd && !lexer.knownCommands[nextObj.cmd]) { - // Not a valid command, i.e. the inline image data *itself* - // contains an "EI" sequence. Resetting the state. - state = 0; - } - } else { + if (!knownCommands) { warn( "findDefaultInlineStreamEnd - `lexer.knownCommands` is undefined." ); + continue; + } + // Check that the "EI" sequence isn't part of the image data, since + // that would cause the image to be truncated (fixes issue11124.pdf). + const tmpLexer = new Lexer( + new Stream(followingBytes.slice()), + knownCommands + ); + // Reduce the number of (potential) warning messages. + tmpLexer._hexStringWarn = () => {}; + let numArgs = 0; + + while (true) { + const nextObj = tmpLexer.getObj(); + + if (nextObj === EOF) { + state = 0; // No valid command found, resetting the state. + break; + } + if (nextObj instanceof Cmd) { + const knownCommand = knownCommands[nextObj.cmd]; + if (!knownCommand) { + // Not a valid command, i.e. the inline image data *itself* + // contains an "EI" sequence. Resetting the state. + state = 0; + break; + } else if ( + knownCommand.variableArgs + ? numArgs <= knownCommand.numArgs + : numArgs === knownCommand.numArgs + ) { + break; // Valid command found. + } + numArgs = 0; + continue; + } + numArgs++; } if (state === 2) { @@ -1284,28 +1317,6 @@ class Lexer { return Cmd.get(str); } - peekObj() { - const streamPos = this.stream.pos, - currentChar = this.currentChar, - beginInlineImagePos = this.beginInlineImagePos; - - let nextObj; - try { - nextObj = this.getObj(); - } catch (ex) { - if (ex instanceof MissingDataException) { - throw ex; - } - warn(`peekObj: ${ex}`); - } - // Ensure that we reset *all* relevant `Lexer`-instance state. - this.stream.pos = streamPos; - this.currentChar = currentChar; - this.beginInlineImagePos = beginInlineImagePos; - - return nextObj; - } - skipToNextLine() { let ch = this.currentChar; while (ch >= 0) { diff --git a/test/pdfs/issue16454.pdf.link b/test/pdfs/issue16454.pdf.link new file mode 100644 index 000000000..228782c9f --- /dev/null +++ b/test/pdfs/issue16454.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/11537582/Pages.62.73.from.0560-22_WSP.Plan_July.2022_Version.1.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index f461a447a..bb17c1f18 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -5963,6 +5963,14 @@ "link": true, "type": "eq" }, + { "id": "issue16454", + "file": "pdfs/issue16454.pdf", + "md5": "82fe0c54a96667472ce999be7a789199", + "rounds": 1, + "lastPage": 1, + "link": true, + "type": "eq" + }, { "id": "decodeACSuccessive", "file": "pdfs/decodeACSuccessive.pdf", "md5": "7749c032624fe27ab8e8d7d5e9a4a93f",