Merge pull request #15920 from Snuffleupagus/transfer-pdf-data

[api-minor] Enable transferring of TypedArray PDF data by default (PR 15908 follow-up)
This commit is contained in:
Jonas Jenwald 2023-01-16 13:20:57 +01:00 committed by GitHub
commit 8f3fa18c93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 97 additions and 126 deletions

View File

@ -139,8 +139,12 @@ if (typeof PDFJSDev === "undefined" || !PDFJSDev.test("PRODUCTION")) {
* @typedef {Object} DocumentInitParameters
* @property {string | URL} [url] - The URL of the PDF.
* @property {BinaryData} [data] - Binary PDF data.
* Use typed arrays (Uint8Array) to improve the memory usage. If PDF data is
* Use TypedArrays (Uint8Array) to improve the memory usage. If PDF data is
* BASE64-encoded, use `atob()` to convert it to a binary string first.
*
* NOTE: If TypedArrays are used they will generally be transferred to the
* worker-thread. This helps reduce main-thread memory usage; however, the
* worker-thread takes ownership of the TypedArrays, so they can no longer
* be used on the main-thread afterwards (their length becomes zero).
* @property {Object} [httpHeaders] - Basic authentication headers.
* @property {boolean} [withCredentials] - Indicates whether or not
* cross-site Access-Control requests should be made using credentials such
@ -189,12 +193,6 @@ if (typeof PDFJSDev === "undefined" || !PDFJSDev.test("PRODUCTION")) {
* @property {number} [maxImageSize] - The maximum allowed image size in total
* pixels, i.e. width * height. Images above this value will not be rendered.
* Use -1 for no limit, which is also the default value.
* @property {boolean} [transferPdfData] - Determines if we can transfer
* TypedArrays used for loading the PDF file, utilized together with:
* - The `data`-option, for the `getDocument` function.
* - The `PDFDataTransportStream` implementation.
* This will help reduce main-thread memory usage, however it will take
* ownership of the TypedArrays. The default value is `false`.
* @property {boolean} [isEvalSupported] - Determines if we can evaluate strings
* as JavaScript. Primarily used to improve performance of font rendering, and
* when parsing PDF functions. The default value is `true`.
@ -281,20 +279,20 @@ function getDocument(src) {
worker = null;
for (const key in source) {
const value = source[key];
const val = source[key];
switch (key) {
case "url":
if (typeof window !== "undefined") {
try {
// The full path is required in the 'url' field.
params[key] = new URL(value, window.location).href;
params[key] = new URL(val, window.location).href;
continue;
} catch (ex) {
warn(`Cannot create valid URL: "${ex}".`);
}
} else if (typeof value === "string" || value instanceof URL) {
params[key] = value.toString(); // Support Node.js environments.
} else if (typeof val === "string" || val instanceof URL) {
params[key] = val.toString(); // Support Node.js environments.
continue;
}
throw new Error(
@ -302,10 +300,10 @@ function getDocument(src) {
"either string or URL-object is expected in the url property."
);
case "range":
rangeTransport = value;
rangeTransport = val;
continue;
case "worker":
worker = value;
worker = val;
continue;
case "data":
// Converting string or array-like data to Uint8Array.
@ -314,21 +312,24 @@ function getDocument(src) {
PDFJSDev.test("GENERIC") &&
isNodeJS &&
typeof Buffer !== "undefined" && // eslint-disable-line no-undef
value instanceof Buffer // eslint-disable-line no-undef
val instanceof Buffer // eslint-disable-line no-undef
) {
params[key] = new Uint8Array(value);
} else if (value instanceof Uint8Array) {
break; // Use the data as-is when it's already a Uint8Array.
} else if (typeof value === "string") {
params[key] = stringToBytes(value);
params[key] = new Uint8Array(val);
} else if (
typeof value === "object" &&
value !== null &&
!isNaN(value.length)
val instanceof Uint8Array &&
val.byteLength === val.buffer.byteLength
) {
params[key] = new Uint8Array(value);
} else if (isArrayBuffer(value)) {
params[key] = new Uint8Array(value);
// Use the data as-is when it's already a Uint8Array that completely
// "utilizes" its underlying ArrayBuffer, to prevent any possible
// issues when transferring it to the worker-thread.
break;
} else if (typeof val === "string") {
params[key] = stringToBytes(val);
} else if (
(typeof val === "object" && val !== null && !isNaN(val.length)) ||
isArrayBuffer(val)
) {
params[key] = new Uint8Array(val);
} else {
throw new Error(
"Invalid PDF binary data: either TypedArray, " +
@ -337,7 +338,7 @@ function getDocument(src) {
}
continue;
}
params[key] = value;
params[key] = val;
}
params.CMapReaderFactory =
@ -345,7 +346,6 @@ function getDocument(src) {
params.StandardFontDataFactory =
params.StandardFontDataFactory || DefaultStandardFontDataFactory;
params.ignoreErrors = params.stopAtErrors !== true;
params.transferPdfData = params.transferPdfData === true;
params.fontExtraProperties = params.fontExtraProperties === true;
params.pdfBug = params.pdfBug === true;
params.enableXfa = params.enableXfa === true;
@ -443,7 +443,6 @@ function getDocument(src) {
{
length: params.length,
initialData: params.initialData,
transferPdfData: params.transferPdfData,
progressiveDone: params.progressiveDone,
contentDispositionFilename: params.contentDispositionFilename,
disableRange: params.disableRange,
@ -518,8 +517,7 @@ async function _fetchDocument(worker, source, pdfDataRangeTransport, docId) {
source.contentDispositionFilename =
pdfDataRangeTransport.contentDispositionFilename;
}
const transfers =
source.transferPdfData && source.data ? [source.data.buffer] : null;
const transfers = source.data ? [source.data.buffer] : null;
const workerId = await worker.messageHandler.sendWithPromise(
"GetDocRequest",
@ -659,6 +657,10 @@ class PDFDocumentLoadingTask {
/**
* Abstract class to support range requests file loading.
*
* NOTE: The TypedArrays passed to the constructor and relevant methods below
* will generally be transferred to the worker-thread. This helps reduce
* main-thread memory usage; however, the worker-thread takes ownership of the
* TypedArrays, so they can no longer be used on the main-thread afterwards.
*/
class PDFDataRangeTransport {
/**

View File

@ -18,13 +18,10 @@ import { isPdfFile } from "./display_utils.js";
/** @implements {IPDFStream} */
class PDFDataTransportStream {
#transferPdfData = false;
constructor(
{
length,
initialData,
transferPdfData = false,
progressiveDone = false,
contentDispositionFilename = null,
disableRange = false,
@ -38,14 +35,17 @@ class PDFDataTransportStream {
);
this._queuedChunks = [];
this.#transferPdfData = transferPdfData;
this._progressiveDone = progressiveDone;
this._contentDispositionFilename = contentDispositionFilename;
if (initialData?.length > 0) {
const buffer = this.#transferPdfData
? initialData.buffer
: new Uint8Array(initialData).buffer;
// Prevent any possible issues by only transferring a Uint8Array that
// completely "utilizes" its underlying ArrayBuffer.
const buffer =
initialData instanceof Uint8Array &&
initialData.byteLength === initialData.buffer.byteLength
? initialData.buffer
: new Uint8Array(initialData).buffer;
this._queuedChunks.push(buffer);
}
@ -77,8 +77,11 @@ class PDFDataTransportStream {
}
_onReceiveData({ begin, chunk }) {
// Prevent any possible issues by only transferring a Uint8Array that
// completely "utilizes" its underlying ArrayBuffer.
const buffer =
this.#transferPdfData && chunk?.length >= 0
chunk instanceof Uint8Array &&
chunk.byteLength === chunk.buffer.byteLength
? chunk.buffer
: new Uint8Array(chunk).buffer;

View File

@ -193,44 +193,10 @@ describe("api", function () {
expect(data[0] instanceof PDFDocumentProxy).toEqual(true);
expect(data[1].loaded / data[1].total).toEqual(1);
// Check that the TypedArray wasn't transferred.
expect(typedArrayPdf.length).toEqual(basicApiFileLength);
await loadingTask.destroy();
});
it("creates pdf doc from TypedArray, with `transferPdfData` set", async function () {
if (isNodeJS) {
pending("Worker is not supported in Node.js.");
if (!isNodeJS) {
// Check that the TypedArray was transferred.
expect(typedArrayPdf.length).toEqual(0);
}
const typedArrayPdf = await DefaultFileReaderFactory.fetch({
path: TEST_PDFS_PATH + basicApiFileName,
});
// Sanity check to make sure that we fetched the entire PDF file.
expect(typedArrayPdf instanceof Uint8Array).toEqual(true);
expect(typedArrayPdf.length).toEqual(basicApiFileLength);
const loadingTask = getDocument({
data: typedArrayPdf,
transferPdfData: true,
});
expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
const progressReportedCapability = createPromiseCapability();
loadingTask.onProgress = function (data) {
progressReportedCapability.resolve(data);
};
const data = await Promise.all([
loadingTask.promise,
progressReportedCapability.promise,
]);
expect(data[0] instanceof PDFDocumentProxy).toEqual(true);
expect(data[1].loaded / data[1].total).toEqual(1);
// Check that the TypedArray was transferred.
expect(typedArrayPdf.length).toEqual(0);
await loadingTask.destroy();
});
@ -259,6 +225,11 @@ describe("api", function () {
expect(data[0] instanceof PDFDocumentProxy).toEqual(true);
expect(data[1].loaded / data[1].total).toEqual(1);
if (!isNodeJS) {
// Check that the ArrayBuffer was transferred.
expect(arrayBufferPdf.byteLength).toEqual(0);
}
await loadingTask.destroy();
});
@ -3275,16 +3246,22 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
it("should fetch document info and page using ranges", async function () {
const initialDataLength = 4000;
const subArrays = [];
let fetches = 0;
const data = await dataPromise;
const initialData = data.subarray(0, initialDataLength);
const initialData = new Uint8Array(data.subarray(0, initialDataLength));
subArrays.push(initialData);
const transport = new PDFDataRangeTransport(data.length, initialData);
transport.requestDataRange = function (begin, end) {
fetches++;
waitSome(function () {
transport.onDataProgress(4000);
transport.onDataRange(begin, data.subarray(begin, end));
const chunk = new Uint8Array(data.subarray(begin, end));
subArrays.push(chunk);
transport.onDataProgress(initialDataLength);
transport.onDataRange(begin, chunk);
});
};
@ -3296,65 +3273,40 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
expect(pdfPage.rotate).toEqual(0);
expect(fetches).toBeGreaterThan(2);
// Check that the TypedArray wasn't transferred.
expect(initialData.length).toEqual(initialDataLength);
await loadingTask.destroy();
});
it("should fetch document info and page using ranges, with `transferPdfData` set", async function () {
if (isNodeJS) {
pending("Worker is not supported in Node.js.");
if (!isNodeJS) {
// Check that the TypedArrays were transferred.
for (const array of subArrays) {
expect(array.length).toEqual(0);
}
}
const initialDataLength = 4000;
let fetches = 0;
const data = await dataPromise;
const initialData = new Uint8Array(data.subarray(0, initialDataLength));
const transport = new PDFDataRangeTransport(data.length, initialData);
transport.requestDataRange = function (begin, end) {
fetches++;
waitSome(function () {
transport.onDataProgress(4000);
transport.onDataRange(
begin,
new Uint8Array(data.subarray(begin, end))
);
});
};
const loadingTask = getDocument({
range: transport,
transferPdfData: true,
});
const pdfDocument = await loadingTask.promise;
expect(pdfDocument.numPages).toEqual(14);
const pdfPage = await pdfDocument.getPage(10);
expect(pdfPage.rotate).toEqual(0);
expect(fetches).toBeGreaterThan(2);
// Check that the TypedArray was transferred.
expect(initialData.length).toEqual(0);
await loadingTask.destroy();
});
it("should fetch document info and page using range and streaming", async function () {
const initialDataLength = 4000;
const subArrays = [];
let fetches = 0;
const data = await dataPromise;
const initialData = data.subarray(0, initialDataLength);
const initialData = new Uint8Array(data.subarray(0, initialDataLength));
subArrays.push(initialData);
const transport = new PDFDataRangeTransport(data.length, initialData);
transport.requestDataRange = function (begin, end) {
fetches++;
if (fetches === 1) {
const chunk = new Uint8Array(data.subarray(initialDataLength));
subArrays.push(chunk);
// Send rest of the data on first range request.
transport.onDataProgressiveRead(data.subarray(initialDataLength));
transport.onDataProgressiveRead(chunk);
}
waitSome(function () {
transport.onDataRange(begin, data.subarray(begin, end));
const chunk = new Uint8Array(data.subarray(begin, end));
subArrays.push(chunk);
transport.onDataRange(begin, chunk);
});
};
@ -3369,6 +3321,14 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
await new Promise(resolve => {
waitSome(resolve);
});
if (!isNodeJS) {
// Check that the TypedArrays were transferred.
for (const array of subArrays) {
expect(array.length).toEqual(0);
}
}
await loadingTask.destroy();
});
@ -3376,12 +3336,16 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
"should fetch document info and page, without range, " +
"using complete initialData",
async function () {
const subArrays = [];
let fetches = 0;
const data = await dataPromise;
const initialData = new Uint8Array(data);
subArrays.push(initialData);
const transport = new PDFDataRangeTransport(
data.length,
data,
initialData,
/* progressiveDone = */ true
);
transport.requestDataRange = function (begin, end) {
@ -3399,6 +3363,13 @@ Caron Broadcasting, Inc., an Ohio corporation (“Lessee”).`)
expect(pdfPage.rotate).toEqual(0);
expect(fetches).toEqual(0);
if (!isNodeJS) {
// Check that the TypedArrays were transferred.
for (const array of subArrays) {
expect(array.length).toEqual(0);
}
}
await loadingTask.destroy();
}
);

View File

@ -270,11 +270,6 @@ const defaultOptions = {
: "../web/standard_fonts/",
kind: OptionKind.API,
},
transferPdfData: {
/** @type {boolean} */
value: typeof PDFJSDev !== "undefined" && PDFJSDev.test("MOZCENTRAL"),
kind: OptionKind.API,
},
verbosity: {
/** @type {number} */
value: 1,