[api-minor] Replace PDFDocumentProxy.getStats with a synchronous PDFDocumentProxy.stats getter

*Please note:* These changes will primarily benefit longer documents, somewhat at the expense of e.g. one-page documents.

The existing `PDFDocumentProxy.getStats` function requires a round-trip to the worker-thread in order to obtain the current document stats. In the default viewer, we currently make one such API-call for *every rendered* page.
This patch proposes replacing that method with a *synchronous* `PDFDocumentProxy.stats` getter instead, combined with re-factoring the worker-thread code by adding a `DocStats`-class to track Stream/Font-types and *only send* them to the main-thread *the first time* that a type is encountered.

Note that in practice most PDF documents only use a fairly limited number of Stream/Font-types, which means that in longer documents most of the `PDFDocumentProxy.getStats`-calls will return the same data.[1]
This re-factoring will obviously benefit longer documents the most[2], and could actually be seen as a regression for one-page documents, since in practice there'll usually be a couple of "DocStats" messages sent during the parsing of the first page. However, if the user zooms/rotates the document (which causes re-rendering), note that even a one-page document would start to benefit from these changes.

Another benefit of having the data available/cached in the API is that unless the document stats change during parsing, repeated `PDFDocumentProxy.stats`-calls will return *the same identical* object.
This is something that we can easily take advantage of in the default viewer, by now *only* reporting "documentStats" telemetry[3] when the data actually have changed rather than once per rendered page (again beneficial in longer documents).

---
[1] Furthermore, the maximum number of `StreamType`/`FontType` values is `10` and `12` respectively, which means that regardless of the complexity and page count in a PDF document there'll never be more than twenty-two "DocStats" messages sent; see 41ac3f0c07/src/shared/util.js (L206-L232)

[2] One example is the `pdf.pdf` document in the test-suite, where rendering all of its 1310 pages only results in a total of seven "DocStats" messages being sent from the worker-thread.

[3] Reporting telemetry, in Firefox, includes using `JSON.stringify` on the data and then sending an event to the `PdfStreamConverter.jsm`-code.
In that code the event is handled and `JSON.parse` is used to retrieve the data, and in the "documentStats"-case we'll then iterate through the data to avoid double-reporting telemetry; see https://searchfox.org/mozilla-central/rev/8f4c180b87e52f3345ef8a3432d6e54bd1eb18dc/toolkit/components/pdfjs/content/PdfStreamConverter.jsm#515-549
This commit is contained in:
Jonas Jenwald 2021-11-11 18:14:26 +01:00
parent 41ac3f0c07
commit 6da0944fc7
10 changed files with 158 additions and 67 deletions

View File

@ -16,7 +16,9 @@
import { import {
assert, assert,
BaseException, BaseException,
FontType,
objectSize, objectSize,
StreamType,
stringToPDFString, stringToPDFString,
warn, warn,
} from "../shared/util.js"; } from "../shared/util.js";
@ -76,6 +78,55 @@ class XRefParseException extends BaseException {
} }
} }
/**
 * Records which Stream-types and Font-types have been encountered, and
 * notifies the supplied message handler whenever a type is added that
 * has not been seen before.
 */
class DocStats {
  /**
   * @param {Object} handler - Object exposing a `send(name, data)` method,
   *   used to deliver the "DocStats" messages.
   */
  constructor(handler) {
    this._handler = handler;
    this._streamTypes = new Set();
    this._fontTypes = new Set();
  }

  /**
   * Sends a "DocStats" message containing *all* types recorded so far,
   * each represented as a `type -> true` entry in a prototype-less object.
   */
  _send() {
    const toFlags = types => {
      const flags = Object.create(null);
      types.forEach(type => {
        flags[type] = true;
      });
      return flags;
    };

    this._handler.send("DocStats", {
      streamTypes: toFlags(this._streamTypes),
      fontTypes: toFlags(this._fontTypes),
    });
  }

  /**
   * Records a Stream-type; a message is only sent the first time that a
   * given type is added.
   * @param {string} type - One of the `StreamType` values.
   */
  addStreamType(type) {
    if (
      typeof PDFJSDev === "undefined" ||
      PDFJSDev.test("!PRODUCTION || TESTING")
    ) {
      assert(StreamType[type] === type, 'addStreamType: Invalid "type" value.');
    }
    if (!this._streamTypes.has(type)) {
      this._streamTypes.add(type);
      this._send();
    }
  }

  /**
   * Records a Font-type; a message is only sent the first time that a
   * given type is added.
   * @param {string} type - One of the `FontType` values.
   */
  addFontType(type) {
    if (
      typeof PDFJSDev === "undefined" ||
      PDFJSDev.test("!PRODUCTION || TESTING")
    ) {
      assert(FontType[type] === type, 'addFontType: Invalid "type" value.');
    }
    if (!this._fontTypes.has(type)) {
      this._fontTypes.add(type);
      this._send();
    }
  }
}
/** /**
* Get the value of an inheritable property. * Get the value of an inheritable property.
* *
@ -481,6 +532,7 @@ function recoverJsURL(str) {
export { export {
collectActions, collectActions,
DocStats,
encodeToXmlString, encodeToXmlString,
escapePDFName, escapePDFName,
getArrayLookupTableFactory, getArrayLookupTableFactory,

View File

@ -1248,8 +1248,7 @@ class PartialEvaluator {
this.translateFont(preEvaluatedFont) this.translateFont(preEvaluatedFont)
.then(translatedFont => { .then(translatedFont => {
if (translatedFont.fontType !== undefined) { if (translatedFont.fontType !== undefined) {
const xrefFontStats = xref.stats.fontTypes; xref.stats.addFontType(translatedFont.fontType);
xrefFontStats[translatedFont.fontType] = true;
} }
fontCapability.resolve( fontCapability.resolve(
@ -1277,8 +1276,9 @@ class PartialEvaluator {
preEvaluatedFont.type, preEvaluatedFont.type,
subtype && subtype.name subtype && subtype.name
); );
const xrefFontStats = xref.stats.fontTypes; if (fontType !== undefined) {
xrefFontStats[fontType] = true; xref.stats.addFontType(fontType);
}
} catch (ex) {} } catch (ex) {}
fontCapability.resolve( fontCapability.resolve(

View File

@ -741,13 +741,13 @@ class Parser {
warn(`Empty "${name}" stream.`); warn(`Empty "${name}" stream.`);
return new NullStream(); return new NullStream();
} }
const xrefStats = this.xref.stats;
try { try {
const xrefStreamStats = this.xref.stats.streamTypes;
switch (name) { switch (name) {
case "Fl": case "Fl":
case "FlateDecode": case "FlateDecode":
xrefStreamStats[StreamType.FLATE] = true; xrefStats.addStreamType(StreamType.FLATE);
if (params) { if (params) {
return new PredictorStream( return new PredictorStream(
new FlateStream(stream, maybeLength), new FlateStream(stream, maybeLength),
@ -758,7 +758,7 @@ class Parser {
return new FlateStream(stream, maybeLength); return new FlateStream(stream, maybeLength);
case "LZW": case "LZW":
case "LZWDecode": case "LZWDecode":
xrefStreamStats[StreamType.LZW] = true; xrefStats.addStreamType(StreamType.LZW);
let earlyChange = 1; let earlyChange = 1;
if (params) { if (params) {
if (params.has("EarlyChange")) { if (params.has("EarlyChange")) {
@ -773,30 +773,30 @@ class Parser {
return new LZWStream(stream, maybeLength, earlyChange); return new LZWStream(stream, maybeLength, earlyChange);
case "DCT": case "DCT":
case "DCTDecode": case "DCTDecode":
xrefStreamStats[StreamType.DCT] = true; xrefStats.addStreamType(StreamType.DCT);
return new JpegStream(stream, maybeLength, params); return new JpegStream(stream, maybeLength, params);
case "JPX": case "JPX":
case "JPXDecode": case "JPXDecode":
xrefStreamStats[StreamType.JPX] = true; xrefStats.addStreamType(StreamType.JPX);
return new JpxStream(stream, maybeLength, params); return new JpxStream(stream, maybeLength, params);
case "A85": case "A85":
case "ASCII85Decode": case "ASCII85Decode":
xrefStreamStats[StreamType.A85] = true; xrefStats.addStreamType(StreamType.A85);
return new Ascii85Stream(stream, maybeLength); return new Ascii85Stream(stream, maybeLength);
case "AHx": case "AHx":
case "ASCIIHexDecode": case "ASCIIHexDecode":
xrefStreamStats[StreamType.AHX] = true; xrefStats.addStreamType(StreamType.AHX);
return new AsciiHexStream(stream, maybeLength); return new AsciiHexStream(stream, maybeLength);
case "CCF": case "CCF":
case "CCITTFaxDecode": case "CCITTFaxDecode":
xrefStreamStats[StreamType.CCF] = true; xrefStats.addStreamType(StreamType.CCF);
return new CCITTFaxStream(stream, maybeLength, params); return new CCITTFaxStream(stream, maybeLength, params);
case "RL": case "RL":
case "RunLengthDecode": case "RunLengthDecode":
xrefStreamStats[StreamType.RLX] = true; xrefStats.addStreamType(StreamType.RLX);
return new RunLengthStream(stream, maybeLength); return new RunLengthStream(stream, maybeLength);
case "JBIG2Decode": case "JBIG2Decode":
xrefStreamStats[StreamType.JBIG] = true; xrefStats.addStreamType(StreamType.JBIG);
return new Jbig2Stream(stream, maybeLength, params); return new Jbig2Stream(stream, maybeLength, params);
} }
warn(`Filter "${name}" is not supported.`); warn(`Filter "${name}" is not supported.`);

View File

@ -115,12 +115,21 @@ class BasePdfManager {
} }
class LocalPdfManager extends BasePdfManager { class LocalPdfManager extends BasePdfManager {
constructor(docId, data, password, evaluatorOptions, enableXfa, docBaseUrl) { constructor(
docId,
data,
password,
msgHandler,
evaluatorOptions,
enableXfa,
docBaseUrl
) {
super(); super();
this._docId = docId; this._docId = docId;
this._password = password; this._password = password;
this._docBaseUrl = parseDocBaseUrl(docBaseUrl); this._docBaseUrl = parseDocBaseUrl(docBaseUrl);
this.msgHandler = msgHandler;
this.evaluatorOptions = evaluatorOptions; this.evaluatorOptions = evaluatorOptions;
this.enableXfa = enableXfa; this.enableXfa = enableXfa;

View File

@ -215,6 +215,7 @@ class WorkerMessageHandler {
docId, docId,
source.data, source.data,
source.password, source.password,
handler,
evaluatorOptions, evaluatorOptions,
enableXfa, enableXfa,
docBaseUrl docBaseUrl
@ -287,6 +288,7 @@ class WorkerMessageHandler {
docId, docId,
pdfFile, pdfFile,
source.password, source.password,
handler,
evaluatorOptions, evaluatorOptions,
enableXfa, enableXfa,
docBaseUrl docBaseUrl
@ -532,10 +534,6 @@ class WorkerMessageHandler {
}); });
}); });
handler.on("GetStats", function wphSetupGetStats(data) {
return pdfManager.ensureXRef("stats");
});
handler.on("GetAnnotations", function ({ pageIndex, intent }) { handler.on("GetAnnotations", function ({ pageIndex, intent }) {
return pdfManager.getPage(pageIndex).then(function (page) { return pdfManager.getPage(pageIndex).then(function (page) {
return page.getAnnotationsData(intent); return page.getAnnotationsData(intent);

View File

@ -30,13 +30,14 @@ import {
isStream, isStream,
Ref, Ref,
} from "./primitives.js"; } from "./primitives.js";
import { Lexer, Parser } from "./parser.js";
import { import {
DocStats,
MissingDataException, MissingDataException,
ParserEOFException, ParserEOFException,
XRefEntryException, XRefEntryException,
XRefParseException, XRefParseException,
} from "./core_utils.js"; } from "./core_utils.js";
import { Lexer, Parser } from "./parser.js";
import { CipherTransformFactory } from "./crypto.js"; import { CipherTransformFactory } from "./crypto.js";
class XRef { class XRef {
@ -46,10 +47,7 @@ class XRef {
this.entries = []; this.entries = [];
this.xrefstms = Object.create(null); this.xrefstms = Object.create(null);
this._cacheMap = new Map(); // Prepare the XRef cache. this._cacheMap = new Map(); // Prepare the XRef cache.
this.stats = { this.stats = new DocStats(pdfManager.msgHandler);
streamTypes: Object.create(null),
fontTypes: Object.create(null),
};
this._newRefNum = null; this._newRefNum = null;
} }

View File

@ -701,6 +701,16 @@ class PDFDocumentProxy {
return this.fingerprints[0]; return this.fingerprints[0];
}, },
}); });
Object.defineProperty(this, "getStats", {
value: async () => {
deprecated(
"`PDFDocumentProxy.getStats`, " +
"please use the `PDFDocumentProxy.stats`-getter instead."
);
return this.stats || { streamTypes: {}, fontTypes: {} };
},
});
} }
} }
@ -728,6 +738,24 @@ class PDFDocumentProxy {
return this._pdfInfo.fingerprints; return this._pdfInfo.fingerprints;
} }
/**
* @typedef {Object} PDFDocumentStats
* @property {Object<string, boolean>} streamTypes - Used stream types in the
* document (an item is set to true if specific stream ID was used in the
* document).
* @property {Object<string, boolean>} fontTypes - Used font types in the
* document (an item is set to true if specific font ID was used in the
* document).
*/
/**
* The current statistics about document structures, read synchronously from
* the transport.
*
* @type {PDFDocumentStats | null} The current statistics, or `null` when no
* statistics exist.
*/
get stats() {
return this._transport.stats;
}
/** /**
* @type {boolean} True if only XFA form. * @type {boolean} True if only XFA form.
*/ */
@ -940,25 +968,6 @@ class PDFDocumentProxy {
return this._transport.downloadInfoCapability.promise; return this._transport.downloadInfoCapability.promise;
} }
/**
* @typedef {Object} PDFDocumentStats
* @property {Object<string, boolean>} streamTypes - Used stream types in the
* document (an item is set to true if specific stream ID was used in the
* document).
* @property {Object<string, boolean>} fontTypes - Used font types in the
* document (an item is set to true if specific font ID was used in the
* document).
*/
/**
* @returns {Promise<PDFDocumentStats>} A promise this is resolved with
* current statistics about document structures (see
* {@link PDFDocumentStats}).
*/
getStats() {
return this._transport.getStats();
}
/** /**
* Cleans up resources allocated by the document on both the main and worker * Cleans up resources allocated by the document on both the main and worker
* threads. * threads.
@ -2392,6 +2401,8 @@ if (typeof PDFJSDev !== "undefined" && PDFJSDev.test("GENERIC")) {
* @ignore * @ignore
*/ */
class WorkerTransport { class WorkerTransport {
#docStats = null;
constructor(messageHandler, loadingTask, networkStream, params) { constructor(messageHandler, loadingTask, networkStream, params) {
this.messageHandler = messageHandler; this.messageHandler = messageHandler;
this.loadingTask = loadingTask; this.loadingTask = loadingTask;
@ -2433,6 +2444,10 @@ class WorkerTransport {
return shadow(this, "annotationStorage", new AnnotationStorage()); return shadow(this, "annotationStorage", new AnnotationStorage());
} }
/**
* @type {Object | null} The document stats currently held in the private
* `#docStats` field, which is initialized to `null`.
*/
get stats() {
return this.#docStats;
}
getRenderingIntent( getRenderingIntent(
intent, intent,
annotationMode = AnnotationMode.ENABLE, annotationMode = AnnotationMode.ENABLE,
@ -2843,6 +2858,18 @@ class WorkerTransport {
}); });
}); });
messageHandler.on("DocStats", data => {
if (this.destroyed) {
return; // Ignore any pending requests if the worker was terminated.
}
// Ensure that a `PDFDocumentProxy.stats` call-site cannot accidentally
// modify this internal data.
this.#docStats = Object.freeze({
streamTypes: Object.freeze(data.streamTypes),
fontTypes: Object.freeze(data.fontTypes),
});
});
messageHandler.on( messageHandler.on(
"UnsupportedFeature", "UnsupportedFeature",
this._onUnsupportedFeature.bind(this) this._onUnsupportedFeature.bind(this)
@ -3055,10 +3082,6 @@ class WorkerTransport {
return this.messageHandler.sendWithPromise("GetMarkInfo", null); return this.messageHandler.sendWithPromise("GetMarkInfo", null);
} }
getStats() {
return this.messageHandler.sendWithPromise("GetStats", null);
}
async startCleanup(keepLoadedFonts = false) { async startCleanup(keepLoadedFonts = false) {
await this.messageHandler.sendWithPromise("Cleanup", null); await this.messageHandler.sendWithPromise("Cleanup", null);

View File

@ -1275,8 +1275,8 @@ describe("api", function () {
}); });
it("gets document stats", async function () { it("gets document stats", async function () {
const stats = await pdfDocument.getStats(); const stats = pdfDocument.stats;
expect(stats).toEqual({ streamTypes: {}, fontTypes: {} }); expect(stats).toEqual(null);
}); });
it("cleans up document resources", async function () { it("cleans up document resources", async function () {
@ -2021,15 +2021,16 @@ sozialökonomische Gerechtigkeit.`)
}); });
it("gets document stats after parsing page", async function () { it("gets document stats after parsing page", async function () {
const stats = await page.getOperatorList().then(function () { await page.getOperatorList();
return pdfDocument.getStats(); const stats = pdfDocument.stats;
});
const expectedStreamTypes = {}; const expectedStreamTypes = {
expectedStreamTypes[StreamType.FLATE] = true; [StreamType.FLATE]: true,
const expectedFontTypes = {}; };
expectedFontTypes[FontType.TYPE1STANDARD] = true; const expectedFontTypes = {
expectedFontTypes[FontType.CIDFONTTYPE2] = true; [FontType.TYPE1STANDARD]: true,
[FontType.CIDFONTTYPE2]: true,
};
expect(stats).toEqual({ expect(stats).toEqual({
streamTypes: expectedStreamTypes, streamTypes: expectedStreamTypes,

View File

@ -16,6 +16,7 @@
import { isRef, Ref } from "../../src/core/primitives.js"; import { isRef, Ref } from "../../src/core/primitives.js";
import { Page, PDFDocument } from "../../src/core/document.js"; import { Page, PDFDocument } from "../../src/core/document.js";
import { assert } from "../../src/shared/util.js"; import { assert } from "../../src/shared/util.js";
import { DocStats } from "../../src/core/core_utils.js";
import { isNodeJS } from "../../src/shared/is_node.js"; import { isNodeJS } from "../../src/shared/is_node.js";
import { StringStream } from "../../src/core/stream.js"; import { StringStream } from "../../src/core/stream.js";
@ -76,10 +77,7 @@ function buildGetDocumentParams(filename, options) {
class XRefMock { class XRefMock {
constructor(array) { constructor(array) {
this._map = Object.create(null); this._map = Object.create(null);
this.stats = { this.stats = new DocStats({ send: () => {} });
streamTypes: Object.create(null),
fontTypes: Object.create(null),
};
this._newRefNum = null; this._newRefNum = null;
for (const key in array) { for (const key in array) {

View File

@ -257,6 +257,7 @@ const PDFViewerApplication = {
_contentDispositionFilename: null, _contentDispositionFilename: null,
_contentLength: null, _contentLength: null,
_saveInProgress: false, _saveInProgress: false,
_docStats: null,
_wheelUnusedTicks: 0, _wheelUnusedTicks: 0,
_idleCallbacks: new Set(), _idleCallbacks: new Set(),
@ -854,6 +855,7 @@ const PDFViewerApplication = {
this._contentDispositionFilename = null; this._contentDispositionFilename = null;
this._contentLength = null; this._contentLength = null;
this._saveInProgress = false; this._saveInProgress = false;
this._docStats = null;
this._cancelIdleCallbacks(); this._cancelIdleCallbacks();
promises.push(this.pdfScriptingManager.destroyPromise); promises.push(this.pdfScriptingManager.destroyPromise);
@ -2108,6 +2110,21 @@ const PDFViewerApplication = {
this._unblockDocumentLoadEvent = () => {}; this._unblockDocumentLoadEvent = () => {};
}, },
/**
* @ignore
*/
_reportDocumentStatsTelemetry() {
const { stats } = this.pdfDocument;
if (stats !== this._docStats) {
this._docStats = stats;
this.externalServices.reportTelemetry({
type: "documentStats",
stats,
});
}
},
/** /**
* Used together with the integration-tests, to enable awaiting full * Used together with the integration-tests, to enable awaiting full
* initialization of the scripting/sandbox. * initialization of the scripting/sandbox.
@ -2347,12 +2364,7 @@ function webViewerPageRendered({ pageNumber, error }) {
} }
// It is a good time to report stream and font types. // It is a good time to report stream and font types.
PDFViewerApplication.pdfDocument.getStats().then(function (stats) { PDFViewerApplication._reportDocumentStatsTelemetry();
PDFViewerApplication.externalServices.reportTelemetry({
type: "documentStats",
stats,
});
});
} }
function webViewerPageMode({ mode }) { function webViewerPageMode({ mode }) {