pdf.js/src/core/jpeg_stream.js
Jonas Jenwald 62a9c26cda Always prefer the PDF.js JPEG decoder for very large images, in order to reduce peak memory usage (issue 11694)
When JPEG images are decoded by the browser, on the main-thread, there's a handful of short-lived copies of the image data; see c3f4690bde/src/display/api.js (L2364-L2408)
That code thus becomes quite problematic for very big JPEG images, since it increases peak memory usage a lot during decoding. In the referenced issue there's a couple of JPEG images whose dimensions are `10006 x 7088` (i.e. ~68 mega-pixels), which causes the *peak* memory usage to increase by close to `1 GB` (i.e. one giga-byte) in my testing.

By letting the PDF.js JPEG decoder, rather than the browser, handle very large images the *peak* memory usage is considerably reduced and the allocated memory also seem to be reclaimed faster.

*Please note:* This will lead to movement in some existing `eq` tests.
2020-03-20 16:37:19 +01:00

260 lines
9.0 KiB
JavaScript

/* Copyright 2012 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { createObjectURL, shadow } from "../shared/util.js";
import { DecodeStream } from "./stream.js";
import { isDict } from "./primitives.js";
import { JpegImage } from "./jpg.js";
/**
* Depending on the type of JPEG a JpegStream is handled in different ways. For
* JPEG's that are supported natively such as DeviceGray and DeviceRGB the image
* data is stored and then loaded by the browser. For unsupported JPEG's we use
* a library to decode these images and the stream behaves like all the other
* DecodeStreams.
*/
const JpegStream = (function JpegStreamClosure() {
function JpegStream(stream, maybeLength, dict, params) {
// Some images may contain 'junk' before the SOI (start-of-image) marker.
// Note: this seems to mainly affect inline images.
let ch;
while ((ch = stream.getByte()) !== -1) {
// Find the first byte of the SOI marker (0xFFD8).
if (ch === 0xff) {
stream.skip(-1); // Reset the stream position to the SOI.
break;
}
}
this.stream = stream;
this.maybeLength = maybeLength;
this.dict = dict;
this.params = params;
DecodeStream.call(this, maybeLength);
}
JpegStream.prototype = Object.create(DecodeStream.prototype);
Object.defineProperty(JpegStream.prototype, "bytes", {
get: function JpegStream_bytes() {
// If `this.maybeLength` is null, we'll get the entire stream.
return shadow(this, "bytes", this.stream.getBytes(this.maybeLength));
},
configurable: true,
});
JpegStream.prototype.ensureBuffer = function(requested) {
// No-op, since `this.readBlock` will always parse the entire image and
// directly insert all of its data into `this.buffer`.
};
JpegStream.prototype.readBlock = function() {
if (this.eof) {
return;
}
const jpegOptions = {
decodeTransform: undefined,
colorTransform: undefined,
};
// Checking if values need to be transformed before conversion.
const decodeArr = this.dict.getArray("Decode", "D");
if (this.forceRGB && Array.isArray(decodeArr)) {
const bitsPerComponent = this.dict.get("BitsPerComponent") || 8;
const decodeArrLength = decodeArr.length;
const transform = new Int32Array(decodeArrLength);
let transformNeeded = false;
const maxValue = (1 << bitsPerComponent) - 1;
for (let i = 0; i < decodeArrLength; i += 2) {
transform[i] = ((decodeArr[i + 1] - decodeArr[i]) * 256) | 0;
transform[i + 1] = (decodeArr[i] * maxValue) | 0;
if (transform[i] !== 256 || transform[i + 1] !== 0) {
transformNeeded = true;
}
}
if (transformNeeded) {
jpegOptions.decodeTransform = transform;
}
}
// Fetching the 'ColorTransform' entry, if it exists.
if (isDict(this.params)) {
const colorTransform = this.params.get("ColorTransform");
if (Number.isInteger(colorTransform)) {
jpegOptions.colorTransform = colorTransform;
}
}
const jpegImage = new JpegImage(jpegOptions);
jpegImage.parse(this.bytes);
const data = jpegImage.getData({
width: this.drawWidth,
height: this.drawHeight,
forceRGB: this.forceRGB,
isSourcePDF: true,
});
this.buffer = data;
this.bufferLength = data.length;
this.eof = true;
};
Object.defineProperty(JpegStream.prototype, "maybeValidDimensions", {
get: function JpegStream_maybeValidDimensions() {
const { dict, stream } = this;
const dictHeight = dict.get("Height", "H");
const startPos = stream.pos;
let validDimensions = true,
foundSOF = false,
b;
while ((b = stream.getByte()) !== -1) {
if (b !== 0xff) {
// Not a valid marker.
continue;
}
switch (stream.getByte()) {
case 0xc0: // SOF0
case 0xc1: // SOF1
case 0xc2: // SOF2
// These three SOF{n} markers are the only ones that the built-in
// PDF.js JPEG decoder currently supports.
foundSOF = true;
stream.pos += 2; // Skip marker length.
stream.pos += 1; // Skip precision.
const scanLines = stream.getUint16();
const samplesPerLine = stream.getUint16();
// Letting the browser handle the JPEG decoding, on the main-thread,
// will cause a *large* increase in peak memory usage since there's
// a handful of short-lived copies of the image data. For very big
// JPEG images, always let the PDF.js image decoder handle them to
// reduce overall memory usage during decoding (see issue 11694).
if (scanLines * samplesPerLine > 1e6) {
validDimensions = false;
break;
}
// The "normal" case, where the image data and dictionary agrees.
if (scanLines === dictHeight) {
break;
}
// A DNL (Define Number of Lines) marker is expected,
// which browsers (usually) cannot decode natively.
if (scanLines === 0) {
validDimensions = false;
break;
}
// The dimensions of the image, among other properties, should
// always be taken from the image data *itself* rather than the
// XObject dictionary. However there's cases of corrupt images that
// browsers cannot decode natively, for example:
// - JPEG images with DNL markers, where the SOF `scanLines`
// parameter has an unexpected value (see issue 8614).
// - JPEG images with too large SOF `scanLines` parameter, where
// the EOI marker is encountered prematurely (see issue 10880).
// In an attempt to handle these kinds of corrupt images, compare
// the dimensions in the image data with the dictionary and *always*
// let the PDF.js JPEG decoder (rather than the browser) handle the
// image if the difference is larger than one order of magnitude
// (since that would generally suggest that something is off).
if (scanLines > dictHeight * 10) {
validDimensions = false;
break;
}
break;
case 0xc3: // SOF3
/* falls through */
case 0xc5: // SOF5
case 0xc6: // SOF6
case 0xc7: // SOF7
/* falls through */
case 0xc9: // SOF9
case 0xca: // SOF10
case 0xcb: // SOF11
/* falls through */
case 0xcd: // SOF13
case 0xce: // SOF14
case 0xcf: // SOF15
foundSOF = true;
break;
case 0xc4: // DHT
case 0xcc: // DAC
/* falls through */
case 0xda: // SOS
case 0xdb: // DQT
case 0xdc: // DNL
case 0xdd: // DRI
case 0xde: // DHP
case 0xdf: // EXP
/* falls through */
case 0xe0: // APP0
case 0xe1: // APP1
case 0xe2: // APP2
case 0xe3: // APP3
case 0xe4: // APP4
case 0xe5: // APP5
case 0xe6: // APP6
case 0xe7: // APP7
case 0xe8: // APP8
case 0xe9: // APP9
case 0xea: // APP10
case 0xeb: // APP11
case 0xec: // APP12
case 0xed: // APP13
case 0xee: // APP14
case 0xef: // APP15
/* falls through */
case 0xfe: // COM
const markerLength = stream.getUint16();
if (markerLength > 2) {
stream.skip(markerLength - 2); // Jump to the next marker.
} else {
// The marker length is invalid, resetting the stream position.
stream.skip(-2);
}
break;
case 0xff: // Fill byte.
// Avoid skipping a valid marker, resetting the stream position.
stream.skip(-1);
break;
case 0xd9: // EOI
foundSOF = true;
break;
}
if (foundSOF) {
break;
}
}
// Finally, don't forget to reset the stream position.
stream.pos = startPos;
return shadow(this, "maybeValidDimensions", validDimensions);
},
configurable: true,
});
JpegStream.prototype.getIR = function(forceDataSchema = false) {
return createObjectURL(this.bytes, "image/jpeg", forceDataSchema);
};
return JpegStream;
})();
export { JpegStream };