Improve "EI" detection in inline images (PR 12028 follow-up, issue 16454)
Given that inline images may contain "EI" sequences in the image data itself, actually finding the end-of-image operator isn't always straightforward. Here we extend the implementation from PR 12028 to potentially check all of the following bytes, rather than stopping immediately. While we have fairly decent test coverage for this code, changing it unfortunately carries a slightly higher than normal risk of regressions. (You'd really wish that PDF generators would just stop using inline images.)
Parent: 094fb3c783
Commit: dfbbb8c0ac
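The core of the change: when a candidate "EI" (followed by whitespace) has been found, the bytes after it are parsed with a temporary Lexer, and the "EI" is only accepted as the end-of-image operator if those bytes form a known command with a plausible number of arguments. Below is a minimal sketch of that check, condensed from the diff that follows; the helper name `looksLikeValidCommand` is purely illustrative, since in the actual patch this logic is inlined in `Parser.findDefaultInlineStreamEnd`.

import { Cmd, EOF } from "./primitives.js";
import { Lexer } from "./parser.js";
import { Stream } from "./stream.js";

// Returns true if the bytes following a candidate "EI" parse as a known
// content-stream command (optionally preceded by its operands), and false
// if they look like more inline-image data.
function looksLikeValidCommand(followingBytes, knownCommands) {
  const tmpLexer = new Lexer(new Stream(followingBytes.slice()), knownCommands);
  let numArgs = 0;

  while (true) {
    const nextObj = tmpLexer.getObj();

    if (nextObj === EOF) {
      return false; // No valid command found in the peeked bytes.
    }
    if (nextObj instanceof Cmd) {
      const knownCommand = knownCommands[nextObj.cmd];
      if (!knownCommand) {
        return false; // Unknown command, i.e. "EI" was part of the image data.
      }
      if (
        knownCommand.variableArgs
          ? numArgs <= knownCommand.numArgs
          : numArgs === knownCommand.numArgs
      ) {
        return true; // A valid command, with a plausible argument count.
      }
      numArgs = 0; // Wrong argument count; keep scanning the remaining bytes.
      continue;
    }
    numArgs++; // Count operands preceding the next command.
  }
}

When this check fails, the state machine in `findDefaultInlineStreamEnd` resets (`state = 0`) and keeps scanning for the next "EI" candidate.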
src/core/parser.js

@@ -26,6 +26,7 @@ import {
   MissingDataException,
   ParserEOFException,
 } from "./core_utils.js";
+import { NullStream, Stream } from "./stream.js";
 import { Ascii85Stream } from "./ascii_85_stream.js";
 import { AsciiHexStream } from "./ascii_hex_stream.js";
 import { CCITTFaxStream } from "./ccitt_stream.js";
@@ -34,7 +35,6 @@ import { Jbig2Stream } from "./jbig2_stream.js";
 import { JpegStream } from "./jpeg_stream.js";
 import { JpxStream } from "./jpx_stream.js";
 import { LZWStream } from "./lzw_stream.js";
-import { NullStream } from "./stream.js";
 import { PredictorStream } from "./predictor_stream.js";
 import { RunLengthStream } from "./run_length_stream.js";
 
@@ -190,9 +190,9 @@ class Parser {
       LF = 0xa,
       CR = 0xd,
       NUL = 0x0;
-    const lexer = this.lexer,
+    const { knownCommands } = this.lexer,
       startPos = stream.pos,
-      n = 10;
+      n = 15;
     let state = 0,
       ch,
       maybeEIPos;
@@ -209,7 +209,12 @@
           maybeEIPos = stream.pos;
           // Let's check that the next `n` bytes are ASCII... just to be sure.
           const followingBytes = stream.peekBytes(n);
-          for (let i = 0, ii = followingBytes.length; i < ii; i++) {
+
+          const ii = followingBytes.length;
+          if (ii === 0) {
+            break; // The end of the stream was reached, nothing to check.
+          }
+          for (let i = 0; i < ii; i++) {
             ch = followingBytes[i];
             if (ch === NUL && followingBytes[i + 1] !== NUL) {
               // NUL bytes are not supposed to occur *outside* of inline
@@ -235,19 +240,47 @@
           if (state !== 2) {
             continue;
           }
-          // Check that the "EI" sequence isn't part of the image data, since
-          // that would cause the image to be truncated (fixes issue11124.pdf).
-          if (lexer.knownCommands) {
-            const nextObj = lexer.peekObj();
-            if (nextObj instanceof Cmd && !lexer.knownCommands[nextObj.cmd]) {
-              // Not a valid command, i.e. the inline image data *itself*
-              // contains an "EI" sequence. Resetting the state.
-              state = 0;
-            }
-          } else {
+          if (!knownCommands) {
             warn(
               "findDefaultInlineStreamEnd - `lexer.knownCommands` is undefined."
             );
+            continue;
           }
+          // Check that the "EI" sequence isn't part of the image data, since
+          // that would cause the image to be truncated (fixes issue11124.pdf).
+          const tmpLexer = new Lexer(
+            new Stream(followingBytes.slice()),
+            knownCommands
+          );
+          // Reduce the number of (potential) warning messages.
+          tmpLexer._hexStringWarn = () => {};
+          let numArgs = 0;
+
+          while (true) {
+            const nextObj = tmpLexer.getObj();
+
+            if (nextObj === EOF) {
+              state = 0; // No valid command found, resetting the state.
+              break;
+            }
+            if (nextObj instanceof Cmd) {
+              const knownCommand = knownCommands[nextObj.cmd];
+              if (!knownCommand) {
+                // Not a valid command, i.e. the inline image data *itself*
+                // contains an "EI" sequence. Resetting the state.
+                state = 0;
+                break;
+              } else if (
+                knownCommand.variableArgs
+                  ? numArgs <= knownCommand.numArgs
+                  : numArgs === knownCommand.numArgs
+              ) {
+                break; // Valid command found.
+              }
+              numArgs = 0;
+              continue;
+            }
+            numArgs++;
+          }
 
           if (state === 2) {
@@ -1284,28 +1317,6 @@ class Lexer {
     return Cmd.get(str);
   }
 
-  peekObj() {
-    const streamPos = this.stream.pos,
-      currentChar = this.currentChar,
-      beginInlineImagePos = this.beginInlineImagePos;
-
-    let nextObj;
-    try {
-      nextObj = this.getObj();
-    } catch (ex) {
-      if (ex instanceof MissingDataException) {
-        throw ex;
-      }
-      warn(`peekObj: ${ex}`);
-    }
-    // Ensure that we reset *all* relevant `Lexer`-instance state.
-    this.stream.pos = streamPos;
-    this.currentChar = currentChar;
-    this.beginInlineImagePos = beginInlineImagePos;
-
-    return nextObj;
-  }
-
   skipToNextLine() {
     let ch = this.currentChar;
     while (ch >= 0) {
test/pdfs/issue16454.pdf.link (new file)

@@ -0,0 +1 @@
+https://github.com/mozilla/pdf.js/files/11537582/Pages.62.73.from.0560-22_WSP.Plan_July.2022_Version.1.pdf
test/test_manifest.json

@@ -5963,6 +5963,14 @@
       "link": true,
       "type": "eq"
     },
+    { "id": "issue16454",
+      "file": "pdfs/issue16454.pdf",
+      "md5": "82fe0c54a96667472ce999be7a789199",
+      "rounds": 1,
+      "lastPage": 1,
+      "link": true,
+      "type": "eq"
+    },
     { "id": "decodeACSuccessive",
       "file": "pdfs/decodeACSuccessive.pdf",
       "md5": "7749c032624fe27ab8e8d7d5e9a4a93f",