From 3351d3476d1f3bf6ff9098d267fbba0ff4f3257a Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Fri, 16 Oct 2020 12:20:44 +0200
Subject: [PATCH 1/2] Don't store complex data in `PDFDocument.formInfo`, and
 replace the `fields` object with a `hasFields` boolean instead

*This patch is based on a couple of smaller things that I noticed when working on PR 12479.*

 - Don't store the /Fields on the `formInfo` getter, since that feels like overloading it with unintended (and too complex) data, and utilize a `hasFields` boolean instead.
   This functionality was originally added in PR 12271, to help determine what kind of form data a PDF document contains, and I think that we should ensure that the return value of `formInfo` only consists of "simple" data.
   With these changes the `fieldObjects` getter instead has to look-up the /Fields manually, however that shouldn't be a problem since the access is guarded by a `formInfo.hasFields` check which ensures that the data both exists and is valid. Furthermore, most documents doesn't even have any /AcroForm data anyway.

 - Determine the `hasFields` property *first*, to ensure that it's always correct even if there's errors when checking e.g. the /XFA or /SigFlags entires, since the `fieldObjects` getter depends on it.

 - Simplify a loop in `fieldObjects`, since the object being accessed is a `Map` and those have built-in iteration support.

 - Use a higher logging level for errors in the `formInfo` getter, and include the actual error message, since that'd have helped with fixing PR 12479 a lot quicker.

 - Update the JSDoc comment in `src/display/api.js` to list the return values correctly, and also slightly extend/improve the description.
---
 src/core/document.js       | 23 ++++++++++-------------
 src/display/api.js         |  5 +++--
 test/unit/document_spec.js | 18 +++++++++---------
 3 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/src/core/document.js b/src/core/document.js
index ee194660a..8702ec21d 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -708,20 +708,23 @@ class PDFDocument {
   }
 
   get formInfo() {
-    const formInfo = { hasAcroForm: false, hasXfa: false, fields: null };
+    const formInfo = { hasFields: false, hasAcroForm: false, hasXfa: false };
     const acroForm = this.catalog.acroForm;
     if (!acroForm) {
       return shadow(this, "formInfo", formInfo);
     }
 
     try {
+      const fields = acroForm.get("Fields");
+      const hasFields = Array.isArray(fields) && fields.length > 0;
+      formInfo.hasFields = hasFields; // Used by the `fieldObjects` getter.
+
       // The document contains XFA data if the `XFA` entry is a non-empty
       // array or stream.
       const xfa = acroForm.get("XFA");
-      const hasXfa =
+      formInfo.hasXfa =
         (Array.isArray(xfa) && xfa.length > 0) ||
         (isStream(xfa) && !xfa.isEmpty);
-      formInfo.hasXfa = hasXfa;
 
       // The document contains AcroForm data if the `Fields` entry is a
       // non-empty array and it doesn't consist of only document signatures.
@@ -730,20 +733,15 @@ class PDFDocument {
       // store (invisible) document signatures. This can be detected using
       // the first bit of the `SigFlags` integer (see Table 219 in the
       // specification).
-      const fields = acroForm.get("Fields");
-      const hasFields = Array.isArray(fields) && fields.length > 0;
       const sigFlags = acroForm.get("SigFlags");
       const hasOnlyDocumentSignatures =
         !!(sigFlags & 0x1) && this._hasOnlyDocumentSignatures(fields);
       formInfo.hasAcroForm = hasFields && !hasOnlyDocumentSignatures;
-      if (hasFields) {
-        formInfo.fields = fields;
-      }
     } catch (ex) {
       if (ex instanceof MissingDataException) {
         throw ex;
       }
-      info("Cannot fetch form information.");
+      warn(`Cannot fetch form information: "${ex}".`);
     }
     return shadow(this, "formInfo", formInfo);
   }
@@ -976,19 +974,18 @@ class PDFDocument {
   }
 
   get fieldObjects() {
-    const formInfo = this.formInfo;
-    if (!formInfo.fields) {
+    if (!this.formInfo.hasFields) {
       return shadow(this, "fieldObjects", Promise.resolve(null));
     }
 
     const allFields = Object.create(null);
     const fieldPromises = new Map();
-    for (const fieldRef of formInfo.fields) {
+    for (const fieldRef of this.catalog.acroForm.get("Fields")) {
       this._collectFieldObjects("", fieldRef, fieldPromises);
     }
 
     const allPromises = [];
-    for (const [name, promises] of fieldPromises.entries()) {
+    for (const [name, promises] of fieldPromises) {
       allPromises.push(
         Promise.all(promises).then(fields => {
           fields = fields.filter(field => field !== null);
diff --git a/src/display/api.js b/src/display/api.js
index 335e050d1..d8ab4ec64 100644
--- a/src/display/api.js
+++ b/src/display/api.js
@@ -878,8 +878,9 @@ class PDFDocumentProxy {
   }
 
   /**
-   * @returns {Promise<Array<Object>>} A promise that is resolved with an
-   *   {Array<Object>} containing field data for the JS sandbox.
+   * @returns {Promise<Array<Object> | null>} A promise that is resolved with an
+   *   {Array<Object>} containing /AcroForm field data for the JS sandbox,
+   *   or `null` when no field data is present in the PDF file.
    */
   getFieldObjects() {
     return this._transport.getFieldObjects();
diff --git a/test/unit/document_spec.js b/test/unit/document_spec.js
index 5224cbd16..be58d85c9 100644
--- a/test/unit/document_spec.js
+++ b/test/unit/document_spec.js
@@ -64,7 +64,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: false,
-        fields: null,
+        hasFields: false,
       });
     });
 
@@ -77,7 +77,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: false,
-        fields: null,
+        hasFields: false,
       });
 
       acroForm.set("XFA", ["foo", "bar"]);
@@ -85,7 +85,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: true,
-        fields: null,
+        hasFields: false,
       });
 
       acroForm.set("XFA", new StringStream(""));
@@ -93,7 +93,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: false,
-        fields: null,
+        hasFields: false,
       });
 
       acroForm.set("XFA", new StringStream("non-empty"));
@@ -101,7 +101,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: true,
-        fields: null,
+        hasFields: false,
       });
     });
 
@@ -114,7 +114,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: false,
-        fields: null,
+        hasFields: false,
       });
 
       acroForm.set("Fields", ["foo", "bar"]);
@@ -122,7 +122,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: true,
         hasXfa: false,
-        fields: ["foo", "bar"],
+        hasFields: true,
       });
 
       // If the first bit of the `SigFlags` entry is set and the `Fields` array
@@ -133,7 +133,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: true,
         hasXfa: false,
-        fields: ["foo", "bar"],
+        hasFields: true,
       });
 
       const annotationDict = new Dict();
@@ -156,7 +156,7 @@ describe("document", function () {
       expect(pdfDocument.formInfo).toEqual({
         hasAcroForm: false,
         hasXfa: false,
-        fields: [kidsRef],
+        hasFields: true,
       });
     });
   });

From 29af15f37e364701289fe9f0cfe17872264524b3 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Fri, 16 Oct 2020 12:57:01 +0200
Subject: [PATCH 2/2] Add more validation in the
 `PDFDocument._hasOnlyDocumentSignatures` method

If this method is ever passed invalid/unexpected data, or if during the course of parsing (since it's used recursively) such data is found, it will fail in a non-graceful way.
Hence this patch, which ensures that we don't attempt to access non-existent properties and also that errors such as the one fixed in PR 12479 wouldn't have occured.
---
 src/core/document.js | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/core/document.js b/src/core/document.js
index 8702ec21d..ca5030402 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -687,8 +687,15 @@ class PDFDocument {
    */
   _hasOnlyDocumentSignatures(fields, recursionDepth = 0) {
     const RECURSION_LIMIT = 10;
+
+    if (!Array.isArray(fields)) {
+      return false;
+    }
     return fields.every(field => {
       field = this.xref.fetchIfRef(field);
+      if (!(field instanceof Dict)) {
+        return false;
+      }
       if (field.has("Kids")) {
         if (++recursionDepth > RECURSION_LIMIT) {
           warn("_hasOnlyDocumentSignatures: maximum recursion depth reached");
@@ -937,6 +944,9 @@ class PDFDocument {
       : clearPrimitiveCaches();
   }
 
+  /**
+   * @private
+   */
   _collectFieldObjects(name, fieldRef, promises) {
     const field = this.xref.fetchIfRef(fieldRef);
     if (field.has("T")) {