From 5494f7d5bc938229d7e5ae02adccc422e14ea4b4 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Thu, 9 Jan 2020 21:39:31 +0100 Subject: [PATCH 1/2] Add basic validation of the `scanLines` parameter in JPEG images, before delegating decoding to the browser In some cases PDF documents can contain JPEG images that the native browser decoder cannot handle, e.g. images with DNL (Define Number of Lines) markers or images where the SOF (Start of Frame) marker contains a wildly incorrect `scanLines` parameter. Currently, for "simple" JPEG images, we're relying on native image decoding to *fail* before falling back to the implementation in `src/core/jpg.js`. In some cases, note e.g. issue 10880, the native image decoder doesn't outright fail and thus some images may not render. In an attempt to improve the current situation, this patch adds additional validation of the JPEG image SOF data to force the use of `src/core/jpg.js` directly in cases where the native JPEG decoder cannot be trusted to do the right thing. The only way to implement this is unfortunately to parse the *beginning* of the JPEG image data, looking for a SOF marker. To limit the impact of this extra parsing, the result is cached on the `JpegStream` instance and this code is only run for images which passed all of the pre-existing "can the JPEG image be natively rendered and/or decoded" checks. --- *Slightly off-topic:* Working on this *really* makes me start questioning if native rendering/decoding of JPEG images is actually a good idea. There's certain kinds of JPEG images not supported natively, and all of the validation which is now necessary isn't "free". At this point, in the `NativeImageDecoder`, we're having to check for certain properties in the image dictionary, parse the `ColorSpace`, and finally read the actual image data to find the SOF marker. Furthermore, we cannot just send the image to the main-thread and be done in the "JpegStream" case, but we also need to wait for rendering to complete (or fail) before continuing with other parsing. In the "JpegDecode" case we're even having to parse part of the image on the main-thread, which seems completely at odds with the principle of doing all heavy parsing in the Worker, and there's also a couple of potentially large (temporary) allocations/copies of TypedArray data involved as well. --- src/core/evaluator.js | 3 +- src/core/image_utils.js | 3 +- src/core/jpeg_stream.js | 129 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 133 insertions(+), 2 deletions(-) diff --git a/src/core/evaluator.js b/src/core/evaluator.js index 508035797..ee1883524 100644 --- a/src/core/evaluator.js +++ b/src/core/evaluator.js @@ -512,7 +512,8 @@ var PartialEvaluator = (function PartialEvaluatorClosure() { this.xref, resources, this.pdfFunctionFactory - ) + ) && + image.maybeValidDimensions ) { // These JPEGs don't need any more processing so we can just send it. return this.handler diff --git a/src/core/image_utils.js b/src/core/image_utils.js index d12862ddf..294ec14a2 100644 --- a/src/core/image_utils.js +++ b/src/core/image_utils.js @@ -41,7 +41,8 @@ class NativeImageDecoder { this.xref, this.resources, this.pdfFunctionFactory - ) + ) && + image.maybeValidDimensions ); } diff --git a/src/core/jpeg_stream.js b/src/core/jpeg_stream.js index 582e194f3..068f75e6e 100644 --- a/src/core/jpeg_stream.js +++ b/src/core/jpeg_stream.js @@ -109,6 +109,135 @@ const JpegStream = (function JpegStreamClosure() { this.eof = true; }; + Object.defineProperty(JpegStream.prototype, "maybeValidDimensions", { + get: function JpegStream_maybeValidDimensions() { + const { dict, stream } = this; + const dictHeight = dict.get("Height", "H"); + const startPos = stream.pos; + + let validDimensions = true, + foundSOF = false, + b; + while ((b = stream.getByte()) !== -1) { + if (b !== 0xff) { + // Not a valid marker. + continue; + } + switch (stream.getByte()) { + case 0xc0: // SOF0 + case 0xc1: // SOF1 + case 0xc2: // SOF2 + // These three SOF{n} markers are the only ones that the built-in + // PDF.js JPEG decoder currently supports. + foundSOF = true; + + stream.pos += 2; // Skip marker length. + stream.pos += 1; // Skip precision. + const scanLines = stream.getUint16(); + + // The "normal" case, where the image data and dictionary agrees. + if (scanLines === dictHeight) { + break; + } + // A DNL (Define Number of Lines) marker is expected, + // which browsers (usually) cannot decode natively. + if (scanLines === 0) { + validDimensions = false; + break; + } + // The dimensions of the image, among other properties, should + // always be taken from the image data *itself* rather than the + // XObject dictionary. However there's cases of corrupt images that + // browsers cannot decode natively, for example: + // - JPEG images with DNL markers, where the SOF `scanLines` + // parameter has an unexpected value (see issue 8614). + // - JPEG images with too large SOF `scanLines` parameter, where + // the EOI marker is encountered prematurely (see issue 10880). + // In an attempt to handle these kinds of corrupt images, compare + // the dimensions in the image data with the dictionary and *always* + // let the PDF.js JPEG decoder (rather than the browser) handle the + // image if the difference is larger than one order of magnitude + // (since that would generally suggest that something is off). + if (scanLines > dictHeight * 10) { + validDimensions = false; + break; + } + break; + + case 0xc3: // SOF3 + /* falls through */ + case 0xc5: // SOF5 + case 0xc6: // SOF6 + case 0xc7: // SOF7 + /* falls through */ + case 0xc9: // SOF9 + case 0xca: // SOF10 + case 0xcb: // SOF11 + /* falls through */ + case 0xcd: // SOF13 + case 0xce: // SOF14 + case 0xcf: // SOF15 + foundSOF = true; + break; + + case 0xc4: // DHT + case 0xcc: // DAC + /* falls through */ + case 0xda: // SOS + case 0xdb: // DQT + case 0xdc: // DNL + case 0xdd: // DRI + case 0xde: // DHP + case 0xdf: // EXP + /* falls through */ + case 0xe0: // APP0 + case 0xe1: // APP1 + case 0xe2: // APP2 + case 0xe3: // APP3 + case 0xe4: // APP4 + case 0xe5: // APP5 + case 0xe6: // APP6 + case 0xe7: // APP7 + case 0xe8: // APP8 + case 0xe9: // APP9 + case 0xea: // APP10 + case 0xeb: // APP11 + case 0xec: // APP12 + case 0xed: // APP13 + case 0xee: // APP14 + case 0xef: // APP15 + /* falls through */ + case 0xfe: // COM + const markerLength = stream.getUint16(); + if (markerLength > 2) { + stream.skip(markerLength - 2); // Jump to the next marker. + } else { + // The marker length is invalid, resetting the stream position. + stream.skip(-2); + } + break; + + case 0xff: // Fill byte. + // Avoid skipping a valid marker, resetting the stream position. + stream.skip(-1); + break; + + case 0xd9: // EOI + foundSOF = true; + break; + } + if (foundSOF) { + break; + } + } + // Finally, don't forget to reset the stream position. + stream.pos = startPos; + + return shadow(this, "maybeValidDimensions", validDimensions); + }, + configurable: true, + }); + JpegStream.prototype.getIR = function(forceDataSchema = false) { return createObjectURL(this.bytes, "image/jpeg", forceDataSchema); }; From c3c3b8cd815edbb85524436c730ef339762209b1 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Sat, 18 Jan 2020 12:53:40 +0100 Subject: [PATCH 2/2] Add a heuristic, in `src/core/jpg.js`, to handle JPEG images with a wildly incorrect SOF (Start of Frame) `scanLines` parameter (issue 10880) *This whole patch feels somewhat arbitrary, and I'd be slightly worried about possibly breaking something else.* To limit the impact of these changes, we only re-parse JPEG images using a reduced `scanLines` value if and only if: An unexpected EOI (End of Image) marker was encountered during decoding of Scan data *and* the "actual" `scanLines` value is at least one order of magnitude smaller than expected. --- src/core/jpg.js | 24 ++++++++++++++++++++---- test/pdfs/issue10880.pdf.link | 1 + test/test_manifest.json | 9 +++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 test/pdfs/issue10880.pdf.link diff --git a/src/core/jpg.js b/src/core/jpg.js index c11321e78..b80739a7d 100644 --- a/src/core/jpg.js +++ b/src/core/jpg.js @@ -148,7 +148,7 @@ var JpegImage = (function JpegImageClosure() { if (bitsData === 0xff) { var nextByte = data[offset++]; if (nextByte) { - if (nextByte === 0xdc && parseDNLMarker) { + if (nextByte === /* DNL = */ 0xdc && parseDNLMarker) { offset += 2; // Skip marker length. const scanLines = readUint16(data, offset); @@ -159,7 +159,22 @@ var JpegImage = (function JpegImageClosure() { scanLines ); } - } else if (nextByte === 0xd9) { + } else if (nextByte === /* EOI = */ 0xd9) { + if (parseDNLMarker) { + // NOTE: only 8-bit JPEG images are supported in this decoder. + const maybeScanLines = blockRow * 8; + // Heuristic to attempt to handle corrupt JPEG images with too + // large `scanLines` parameter, by falling back to the currently + // parsed number of scanLines when it's at least one order of + // magnitude smaller than expected (fixes issue10880.pdf). + if (maybeScanLines > 0 && maybeScanLines < frame.scanLines / 10) { + throw new DNLMarkerError( + "Found EOI marker (0xFFD9) while parsing scan data, " + + "possibly caused by incorrect `scanLines` parameter", + maybeScanLines + ); + } + } throw new EOIMarkerError( "Found EOI marker (0xFFD9) while parsing scan data" ); @@ -337,17 +352,18 @@ var JpegImage = (function JpegImageClosure() { } } + let blockRow = 0; function decodeMcu(component, decode, mcu, row, col) { var mcuRow = (mcu / mcusPerLine) | 0; var mcuCol = mcu % mcusPerLine; - var blockRow = mcuRow * component.v + row; + blockRow = mcuRow * component.v + row; var blockCol = mcuCol * component.h + col; var offset = getBlockBufferOffset(component, blockRow, blockCol); decode(component, offset); } function decodeBlock(component, decode, mcu) { - var blockRow = (mcu / component.blocksPerLine) | 0; + blockRow = (mcu / component.blocksPerLine) | 0; var blockCol = mcu % component.blocksPerLine; var offset = getBlockBufferOffset(component, blockRow, blockCol); decode(component, offset); diff --git a/test/pdfs/issue10880.pdf.link b/test/pdfs/issue10880.pdf.link new file mode 100644 index 000000000..10f4e7b79 --- /dev/null +++ b/test/pdfs/issue10880.pdf.link @@ -0,0 +1 @@ +https://github.com/mozilla/pdf.js/files/3247065/B3-T-G5-50.pdf diff --git a/test/test_manifest.json b/test/test_manifest.json index d80a1dc25..baa8bece8 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -3614,6 +3614,15 @@ "lastPage": 1, "type": "eq" }, + { "id": "issue10880", + "file": "pdfs/issue10880.pdf", + "md5": "244ee5ee3ab88db8d8eb51d4416e2c97", + "rounds": 1, + "link": true, + "firstPage": 7, + "lastPage": 7, + "type": "eq" + }, { "id": "issue9650", "file": "pdfs/issue9650.pdf", "md5": "20d50bda6b1080b6d9088811299c791e",