From 99cfab18c10a77f4c2f5fe17eb719ea67df82e61 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 13 Jan 2023 11:16:19 +0100 Subject: [PATCH 1/2] Combine the array-like and ArrayBuffer branches, when handling binary data, in `getDocument` --- src/display/api.js | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/display/api.js b/src/display/api.js index cc8357c6d..2e84f4c64 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -281,20 +281,20 @@ function getDocument(src) { worker = null; for (const key in source) { - const value = source[key]; + const val = source[key]; switch (key) { case "url": if (typeof window !== "undefined") { try { // The full path is required in the 'url' field. - params[key] = new URL(value, window.location).href; + params[key] = new URL(val, window.location).href; continue; } catch (ex) { warn(`Cannot create valid URL: "${ex}".`); } - } else if (typeof value === "string" || value instanceof URL) { - params[key] = value.toString(); // Support Node.js environments. + } else if (typeof val === "string" || val instanceof URL) { + params[key] = val.toString(); // Support Node.js environments. continue; } throw new Error( @@ -302,10 +302,10 @@ function getDocument(src) { "either string or URL-object is expected in the url property." ); case "range": - rangeTransport = value; + rangeTransport = val; continue; case "worker": - worker = value; + worker = val; continue; case "data": // Converting string or array-like data to Uint8Array. @@ -314,21 +314,18 @@ function getDocument(src) { PDFJSDev.test("GENERIC") && isNodeJS && typeof Buffer !== "undefined" && // eslint-disable-line no-undef - value instanceof Buffer // eslint-disable-line no-undef + val instanceof Buffer // eslint-disable-line no-undef ) { - params[key] = new Uint8Array(value); - } else if (value instanceof Uint8Array) { + params[key] = new Uint8Array(val); + } else if (val instanceof Uint8Array) { break; // Use the data as-is when it's already a Uint8Array. - } else if (typeof value === "string") { - params[key] = stringToBytes(value); + } else if (typeof val === "string") { + params[key] = stringToBytes(val); } else if ( - typeof value === "object" && - value !== null && - !isNaN(value.length) + (typeof val === "object" && val !== null && !isNaN(val.length)) || + isArrayBuffer(val) ) { - params[key] = new Uint8Array(value); - } else if (isArrayBuffer(value)) { - params[key] = new Uint8Array(value); + params[key] = new Uint8Array(val); } else { throw new Error( "Invalid PDF binary data: either TypedArray, " + @@ -337,7 +334,7 @@ function getDocument(src) { } continue; } - params[key] = value; + params[key] = val; } params.CMapReaderFactory = From 397f943ca327dadade6af2eb58685331d3c03065 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 13 Jan 2023 11:16:28 +0100 Subject: [PATCH 2/2] [api-minor] Enable transferring of TypedArray PDF data by default (PR 15908 follow-up) This patch removes the recently introduced `transferPdfData` API-option, and simply enables transferring of TypedArray data *by default* instead of copying it. This will help reduce main-thread memory usage, however it will take ownership of the TypedArrays. Currently this only applies to the following cases: - TypedArrays passed to the `getDocument`-function in the API, in order to open PDF documents from binary data. - TypedArrays passed to a `PDFDataRangeTransport`-instance, used to support custom PDF document fetching/loading (see e.g. the Firefox PDF Viewer). *PLEASE NOTE:* To avoid being affected by this, please simply *copy* any TypedArray data before passing it to either of the functions/methods mentioned above. Now that we transfer TypedArray data that we previously only copied, we need to be more careful with input validation. Given how the `{IPDFStreamReader, IPDFStreamRangeReader}.read` methods will always return ArrayBuffer data, which is then transferred to the worker-thread[1], the actual TypedArray data passed to the API thus need to have the same exact size as its underlying ArrayBuffer to prevent issues. Hence we'll check for this and only allow transferring of *safe* TypedArray data, and fallback to simply copying the data just as before. This obviously shouldn't be an issue in the Firefox PDF Viewer, but for the general PDF.js library we need to be more careful here. --- [1] See https://github.com/mozilla/pdf.js/blob/e09ad99973b1dcb82a06c001da96d52fc5bcab9d/src/display/api.js#L2492-L2506 respectively https://github.com/mozilla/pdf.js/blob/e09ad99973b1dcb82a06c001da96d52fc5bcab9d/src/display/api.js#L2578-L2590 --- src/display/api.js | 31 +++++--- src/display/transport_stream.js | 19 +++-- test/unit/api_spec.js | 137 +++++++++++++------------------- web/app_options.js | 5 -- 4 files changed, 83 insertions(+), 109 deletions(-) diff --git a/src/display/api.js b/src/display/api.js index 2e84f4c64..a90e8bcd6 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -139,8 +139,12 @@ if (typeof PDFJSDev === "undefined" || !PDFJSDev.test("PRODUCTION")) { * @typedef {Object} DocumentInitParameters * @property {string | URL} [url] - The URL of the PDF. * @property {BinaryData} [data] - Binary PDF data. - * Use typed arrays (Uint8Array) to improve the memory usage. If PDF data is + * Use TypedArrays (Uint8Array) to improve the memory usage. If PDF data is * BASE64-encoded, use `atob()` to convert it to a binary string first. + * + * NOTE: If TypedArrays are used they will generally be transferred to the + * worker-thread. This will help reduce main-thread memory usage, however + * it will take ownership of the TypedArrays. * @property {Object} [httpHeaders] - Basic authentication headers. * @property {boolean} [withCredentials] - Indicates whether or not * cross-site Access-Control requests should be made using credentials such @@ -189,12 +193,6 @@ if (typeof PDFJSDev === "undefined" || !PDFJSDev.test("PRODUCTION")) { * @property {number} [maxImageSize] - The maximum allowed image size in total * pixels, i.e. width * height. Images above this value will not be rendered. * Use -1 for no limit, which is also the default value. - * @property {boolean} [transferPdfData] - Determines if we can transfer - * TypedArrays used for loading the PDF file, utilized together with: - * - The `data`-option, for the `getDocument` function. - * - The `PDFDataTransportStream` implementation. - * This will help reduce main-thread memory usage, however it will take - * ownership of the TypedArrays. The default value is `false`. * @property {boolean} [isEvalSupported] - Determines if we can evaluate strings * as JavaScript. Primarily used to improve performance of font rendering, and * when parsing PDF functions. The default value is `true`. @@ -317,8 +315,14 @@ function getDocument(src) { val instanceof Buffer // eslint-disable-line no-undef ) { params[key] = new Uint8Array(val); - } else if (val instanceof Uint8Array) { - break; // Use the data as-is when it's already a Uint8Array. + } else if ( + val instanceof Uint8Array && + val.byteLength === val.buffer.byteLength + ) { + // Use the data as-is when it's already a Uint8Array that completely + // "utilizes" its underlying ArrayBuffer, to prevent any possible + // issues when transferring it to the worker-thread. + break; } else if (typeof val === "string") { params[key] = stringToBytes(val); } else if ( @@ -342,7 +346,6 @@ function getDocument(src) { params.StandardFontDataFactory = params.StandardFontDataFactory || DefaultStandardFontDataFactory; params.ignoreErrors = params.stopAtErrors !== true; - params.transferPdfData = params.transferPdfData === true; params.fontExtraProperties = params.fontExtraProperties === true; params.pdfBug = params.pdfBug === true; params.enableXfa = params.enableXfa === true; @@ -440,7 +443,6 @@ function getDocument(src) { { length: params.length, initialData: params.initialData, - transferPdfData: params.transferPdfData, progressiveDone: params.progressiveDone, contentDispositionFilename: params.contentDispositionFilename, disableRange: params.disableRange, @@ -515,8 +517,7 @@ async function _fetchDocument(worker, source, pdfDataRangeTransport, docId) { source.contentDispositionFilename = pdfDataRangeTransport.contentDispositionFilename; } - const transfers = - source.transferPdfData && source.data ? [source.data.buffer] : null; + const transfers = source.data ? [source.data.buffer] : null; const workerId = await worker.messageHandler.sendWithPromise( "GetDocRequest", @@ -656,6 +657,10 @@ class PDFDocumentLoadingTask { /** * Abstract class to support range requests file loading. + * + * NOTE: The TypedArrays passed to the constructor and relevant methods below + * will generally be transferred to the worker-thread. This will help reduce + * main-thread memory usage, however it will take ownership of the TypedArrays. */ class PDFDataRangeTransport { /** diff --git a/src/display/transport_stream.js b/src/display/transport_stream.js index 38460d55d..b27dbaf7d 100644 --- a/src/display/transport_stream.js +++ b/src/display/transport_stream.js @@ -18,13 +18,10 @@ import { isPdfFile } from "./display_utils.js"; /** @implements {IPDFStream} */ class PDFDataTransportStream { - #transferPdfData = false; - constructor( { length, initialData, - transferPdfData = false, progressiveDone = false, contentDispositionFilename = null, disableRange = false, @@ -38,14 +35,17 @@ class PDFDataTransportStream { ); this._queuedChunks = []; - this.#transferPdfData = transferPdfData; this._progressiveDone = progressiveDone; this._contentDispositionFilename = contentDispositionFilename; if (initialData?.length > 0) { - const buffer = this.#transferPdfData - ? initialData.buffer - : new Uint8Array(initialData).buffer; + // Prevent any possible issues by only transferring a Uint8Array that + // completely "utilizes" its underlying ArrayBuffer. + const buffer = + initialData instanceof Uint8Array && + initialData.byteLength === initialData.buffer.byteLength + ? initialData.buffer + : new Uint8Array(initialData).buffer; this._queuedChunks.push(buffer); } @@ -77,8 +77,11 @@ class PDFDataTransportStream { } _onReceiveData({ begin, chunk }) { + // Prevent any possible issues by only transferring a Uint8Array that + // completely "utilizes" its underlying ArrayBuffer. const buffer = - this.#transferPdfData && chunk?.length >= 0 + chunk instanceof Uint8Array && + chunk.byteLength === chunk.buffer.byteLength ? chunk.buffer : new Uint8Array(chunk).buffer; diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 53cb12de9..0d373d18f 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -193,44 +193,10 @@ describe("api", function () { expect(data[0] instanceof PDFDocumentProxy).toEqual(true); expect(data[1].loaded / data[1].total).toEqual(1); - // Check that the TypedArray wasn't transferred. - expect(typedArrayPdf.length).toEqual(basicApiFileLength); - - await loadingTask.destroy(); - }); - - it("creates pdf doc from TypedArray, with `transferPdfData` set", async function () { - if (isNodeJS) { - pending("Worker is not supported in Node.js."); + if (!isNodeJS) { + // Check that the TypedArray was transferred. + expect(typedArrayPdf.length).toEqual(0); } - const typedArrayPdf = await DefaultFileReaderFactory.fetch({ - path: TEST_PDFS_PATH + basicApiFileName, - }); - - // Sanity check to make sure that we fetched the entire PDF file. - expect(typedArrayPdf instanceof Uint8Array).toEqual(true); - expect(typedArrayPdf.length).toEqual(basicApiFileLength); - - const loadingTask = getDocument({ - data: typedArrayPdf, - transferPdfData: true, - }); - expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true); - - const progressReportedCapability = createPromiseCapability(); - loadingTask.onProgress = function (data) { - progressReportedCapability.resolve(data); - }; - - const data = await Promise.all([ - loadingTask.promise, - progressReportedCapability.promise, - ]); - expect(data[0] instanceof PDFDocumentProxy).toEqual(true); - expect(data[1].loaded / data[1].total).toEqual(1); - - // Check that the TypedArray was transferred. - expect(typedArrayPdf.length).toEqual(0); await loadingTask.destroy(); }); @@ -259,6 +225,11 @@ describe("api", function () { expect(data[0] instanceof PDFDocumentProxy).toEqual(true); expect(data[1].loaded / data[1].total).toEqual(1); + if (!isNodeJS) { + // Check that the ArrayBuffer was transferred. + expect(arrayBufferPdf.byteLength).toEqual(0); + } + await loadingTask.destroy(); }); @@ -3275,16 +3246,22 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) it("should fetch document info and page using ranges", async function () { const initialDataLength = 4000; + const subArrays = []; let fetches = 0; const data = await dataPromise; - const initialData = data.subarray(0, initialDataLength); + const initialData = new Uint8Array(data.subarray(0, initialDataLength)); + subArrays.push(initialData); + const transport = new PDFDataRangeTransport(data.length, initialData); transport.requestDataRange = function (begin, end) { fetches++; waitSome(function () { - transport.onDataProgress(4000); - transport.onDataRange(begin, data.subarray(begin, end)); + const chunk = new Uint8Array(data.subarray(begin, end)); + subArrays.push(chunk); + + transport.onDataProgress(initialDataLength); + transport.onDataRange(begin, chunk); }); }; @@ -3296,65 +3273,40 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) expect(pdfPage.rotate).toEqual(0); expect(fetches).toBeGreaterThan(2); - // Check that the TypedArray wasn't transferred. - expect(initialData.length).toEqual(initialDataLength); - - await loadingTask.destroy(); - }); - - it("should fetch document info and page using ranges, with `transferPdfData` set", async function () { - if (isNodeJS) { - pending("Worker is not supported in Node.js."); + if (!isNodeJS) { + // Check that the TypedArrays were transferred. + for (const array of subArrays) { + expect(array.length).toEqual(0); + } } - const initialDataLength = 4000; - let fetches = 0; - - const data = await dataPromise; - const initialData = new Uint8Array(data.subarray(0, initialDataLength)); - const transport = new PDFDataRangeTransport(data.length, initialData); - transport.requestDataRange = function (begin, end) { - fetches++; - waitSome(function () { - transport.onDataProgress(4000); - transport.onDataRange( - begin, - new Uint8Array(data.subarray(begin, end)) - ); - }); - }; - - const loadingTask = getDocument({ - range: transport, - transferPdfData: true, - }); - const pdfDocument = await loadingTask.promise; - expect(pdfDocument.numPages).toEqual(14); - - const pdfPage = await pdfDocument.getPage(10); - expect(pdfPage.rotate).toEqual(0); - expect(fetches).toBeGreaterThan(2); - - // Check that the TypedArray was transferred. - expect(initialData.length).toEqual(0); await loadingTask.destroy(); }); it("should fetch document info and page using range and streaming", async function () { const initialDataLength = 4000; + const subArrays = []; let fetches = 0; const data = await dataPromise; - const initialData = data.subarray(0, initialDataLength); + const initialData = new Uint8Array(data.subarray(0, initialDataLength)); + subArrays.push(initialData); + const transport = new PDFDataRangeTransport(data.length, initialData); transport.requestDataRange = function (begin, end) { fetches++; if (fetches === 1) { + const chunk = new Uint8Array(data.subarray(initialDataLength)); + subArrays.push(chunk); + // Send rest of the data on first range request. - transport.onDataProgressiveRead(data.subarray(initialDataLength)); + transport.onDataProgressiveRead(chunk); } waitSome(function () { - transport.onDataRange(begin, data.subarray(begin, end)); + const chunk = new Uint8Array(data.subarray(begin, end)); + subArrays.push(chunk); + + transport.onDataRange(begin, chunk); }); }; @@ -3369,6 +3321,14 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) await new Promise(resolve => { waitSome(resolve); }); + + if (!isNodeJS) { + // Check that the TypedArrays were transferred. + for (const array of subArrays) { + expect(array.length).toEqual(0); + } + } + await loadingTask.destroy(); }); @@ -3376,12 +3336,16 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) "should fetch document info and page, without range, " + "using complete initialData", async function () { + const subArrays = []; let fetches = 0; const data = await dataPromise; + const initialData = new Uint8Array(data); + subArrays.push(initialData); + const transport = new PDFDataRangeTransport( data.length, - data, + initialData, /* progressiveDone = */ true ); transport.requestDataRange = function (begin, end) { @@ -3399,6 +3363,13 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`) expect(pdfPage.rotate).toEqual(0); expect(fetches).toEqual(0); + if (!isNodeJS) { + // Check that the TypedArrays were transferred. + for (const array of subArrays) { + expect(array.length).toEqual(0); + } + } + await loadingTask.destroy(); } ); diff --git a/web/app_options.js b/web/app_options.js index bc98b00db..d0cafe242 100644 --- a/web/app_options.js +++ b/web/app_options.js @@ -270,11 +270,6 @@ const defaultOptions = { : "../web/standard_fonts/", kind: OptionKind.API, }, - transferPdfData: { - /** @type {boolean} */ - value: typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL"), - kind: OptionKind.API, - }, verbosity: { /** @type {number} */ value: 1,