From 3351d3476d1f3bf6ff9098d267fbba0ff4f3257a Mon Sep 17 00:00:00 2001 From: Jonas Jenwald <jonas.jenwald@gmail.com> Date: Fri, 16 Oct 2020 12:20:44 +0200 Subject: [PATCH 1/2] Don't store complex data in `PDFDocument.formInfo`, and replace the `fields` object with a `hasFields` boolean instead *This patch is based on a couple of smaller things that I noticed when working on PR 12479.* - Don't store the /Fields on the `formInfo` getter, since that feels like overloading it with unintended (and too complex) data, and utilize a `hasFields` boolean instead. This functionality was originally added in PR 12271, to help determine what kind of form data a PDF document contains, and I think that we should ensure that the return value of `formInfo` only consists of "simple" data. With these changes the `fieldObjects` getter instead has to look-up the /Fields manually, however that shouldn't be a problem since the access is guarded by a `formInfo.hasFields` check which ensures that the data both exists and is valid. Furthermore, most documents doesn't even have any /AcroForm data anyway. - Determine the `hasFields` property *first*, to ensure that it's always correct even if there's errors when checking e.g. the /XFA or /SigFlags entires, since the `fieldObjects` getter depends on it. - Simplify a loop in `fieldObjects`, since the object being accessed is a `Map` and those have built-in iteration support. - Use a higher logging level for errors in the `formInfo` getter, and include the actual error message, since that'd have helped with fixing PR 12479 a lot quicker. - Update the JSDoc comment in `src/display/api.js` to list the return values correctly, and also slightly extend/improve the description. --- src/core/document.js | 23 ++++++++++------------- src/display/api.js | 5 +++-- test/unit/document_spec.js | 18 +++++++++--------- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index ee194660a..8702ec21d 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -708,20 +708,23 @@ class PDFDocument { } get formInfo() { - const formInfo = { hasAcroForm: false, hasXfa: false, fields: null }; + const formInfo = { hasFields: false, hasAcroForm: false, hasXfa: false }; const acroForm = this.catalog.acroForm; if (!acroForm) { return shadow(this, "formInfo", formInfo); } try { + const fields = acroForm.get("Fields"); + const hasFields = Array.isArray(fields) && fields.length > 0; + formInfo.hasFields = hasFields; // Used by the `fieldObjects` getter. + // The document contains XFA data if the `XFA` entry is a non-empty // array or stream. const xfa = acroForm.get("XFA"); - const hasXfa = + formInfo.hasXfa = (Array.isArray(xfa) && xfa.length > 0) || (isStream(xfa) && !xfa.isEmpty); - formInfo.hasXfa = hasXfa; // The document contains AcroForm data if the `Fields` entry is a // non-empty array and it doesn't consist of only document signatures. @@ -730,20 +733,15 @@ class PDFDocument { // store (invisible) document signatures. This can be detected using // the first bit of the `SigFlags` integer (see Table 219 in the // specification). - const fields = acroForm.get("Fields"); - const hasFields = Array.isArray(fields) && fields.length > 0; const sigFlags = acroForm.get("SigFlags"); const hasOnlyDocumentSignatures = !!(sigFlags & 0x1) && this._hasOnlyDocumentSignatures(fields); formInfo.hasAcroForm = hasFields && !hasOnlyDocumentSignatures; - if (hasFields) { - formInfo.fields = fields; - } } catch (ex) { if (ex instanceof MissingDataException) { throw ex; } - info("Cannot fetch form information."); + warn(`Cannot fetch form information: "${ex}".`); } return shadow(this, "formInfo", formInfo); } @@ -976,19 +974,18 @@ class PDFDocument { } get fieldObjects() { - const formInfo = this.formInfo; - if (!formInfo.fields) { + if (!this.formInfo.hasFields) { return shadow(this, "fieldObjects", Promise.resolve(null)); } const allFields = Object.create(null); const fieldPromises = new Map(); - for (const fieldRef of formInfo.fields) { + for (const fieldRef of this.catalog.acroForm.get("Fields")) { this._collectFieldObjects("", fieldRef, fieldPromises); } const allPromises = []; - for (const [name, promises] of fieldPromises.entries()) { + for (const [name, promises] of fieldPromises) { allPromises.push( Promise.all(promises).then(fields => { fields = fields.filter(field => field !== null); diff --git a/src/display/api.js b/src/display/api.js index 335e050d1..d8ab4ec64 100644 --- a/src/display/api.js +++ b/src/display/api.js @@ -878,8 +878,9 @@ class PDFDocumentProxy { } /** - * @returns {Promise<Array<Object>>} A promise that is resolved with an - * {Array<Object>} containing field data for the JS sandbox. + * @returns {Promise<Array<Object> | null>} A promise that is resolved with an + * {Array<Object>} containing /AcroForm field data for the JS sandbox, + * or `null` when no field data is present in the PDF file. */ getFieldObjects() { return this._transport.getFieldObjects(); diff --git a/test/unit/document_spec.js b/test/unit/document_spec.js index 5224cbd16..be58d85c9 100644 --- a/test/unit/document_spec.js +++ b/test/unit/document_spec.js @@ -64,7 +64,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: false, - fields: null, + hasFields: false, }); }); @@ -77,7 +77,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: false, - fields: null, + hasFields: false, }); acroForm.set("XFA", ["foo", "bar"]); @@ -85,7 +85,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: true, - fields: null, + hasFields: false, }); acroForm.set("XFA", new StringStream("")); @@ -93,7 +93,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: false, - fields: null, + hasFields: false, }); acroForm.set("XFA", new StringStream("non-empty")); @@ -101,7 +101,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: true, - fields: null, + hasFields: false, }); }); @@ -114,7 +114,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: false, - fields: null, + hasFields: false, }); acroForm.set("Fields", ["foo", "bar"]); @@ -122,7 +122,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: true, hasXfa: false, - fields: ["foo", "bar"], + hasFields: true, }); // If the first bit of the `SigFlags` entry is set and the `Fields` array @@ -133,7 +133,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: true, hasXfa: false, - fields: ["foo", "bar"], + hasFields: true, }); const annotationDict = new Dict(); @@ -156,7 +156,7 @@ describe("document", function () { expect(pdfDocument.formInfo).toEqual({ hasAcroForm: false, hasXfa: false, - fields: [kidsRef], + hasFields: true, }); }); }); From 29af15f37e364701289fe9f0cfe17872264524b3 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald <jonas.jenwald@gmail.com> Date: Fri, 16 Oct 2020 12:57:01 +0200 Subject: [PATCH 2/2] Add more validation in the `PDFDocument._hasOnlyDocumentSignatures` method If this method is ever passed invalid/unexpected data, or if during the course of parsing (since it's used recursively) such data is found, it will fail in a non-graceful way. Hence this patch, which ensures that we don't attempt to access non-existent properties and also that errors such as the one fixed in PR 12479 wouldn't have occured. --- src/core/document.js | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/core/document.js b/src/core/document.js index 8702ec21d..ca5030402 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -687,8 +687,15 @@ class PDFDocument { */ _hasOnlyDocumentSignatures(fields, recursionDepth = 0) { const RECURSION_LIMIT = 10; + + if (!Array.isArray(fields)) { + return false; + } return fields.every(field => { field = this.xref.fetchIfRef(field); + if (!(field instanceof Dict)) { + return false; + } if (field.has("Kids")) { if (++recursionDepth > RECURSION_LIMIT) { warn("_hasOnlyDocumentSignatures: maximum recursion depth reached"); @@ -937,6 +944,9 @@ class PDFDocument { : clearPrimitiveCaches(); } + /** + * @private + */ _collectFieldObjects(name, fieldRef, promises) { const field = this.xref.fetchIfRef(fieldRef); if (field.has("T")) {