From b513c64d9da6cf050c1b5d3b0500e367b668789e Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Fri, 24 Dec 2021 13:46:35 +0100
Subject: [PATCH] [api-minor] Convert `Catalog.getPageDict` to an asynchronous
 method

Besides converting `Catalog.getPageDict` to an `async` method, thus simplifying the code, this patch also allows us to pro-actively fix a existing issue.
Note how we're looking up References in such a way that `MissingDataException`s won't cause trouble, however it's *technically possible* that the entries (i.e. /Count, /Kids, and /Type) in a /Pages Dictionary could actually be indirect objects as well. In the existing code this could lead to *some*, or even all, pages failing to load/render as intended.
In practice that doesn't *appear* to happen in real-world PDF documents, but given all the weird things that PDF software do I'd prefer to fix this pro-actively (rather than waiting for a bug report).
With `Catalog.getPageDict` being `async` this is now really simple to address, however I didn't want to introduce a bunch more *unconditional* asynchronicity in this method if it could be avoided (since that could slow things down). Hence we'll *synchronously* lookup the *raw* data in a /Pages Dictionary, and only fallback to asynchronous data lookup when a Reference was encountered.

In addition to the above, this patch also makes the following notable changes:
 - Let `Catalog.getPageDict` *consistently* reject with the actual error, regardless of what data we're fetching. Previously we'd "swallow" the actual errors except when looking up Dictionary entries, which is inconsistent and thus seem unfortunate. As can be seen from the updated unit-tests this change is API-observable, hence why the patch is tagged `[api-minor]`.

 - Improve the consistency of the Dictionary /Type-checks in both the `Catalog.getPageDict` and `Catalog.getAllPageDicts` methods.
   In `Catalog.getPageDict` there's a fallback code-path where we're *incorrectly* checking the /Page Dictionary for a /Contents-entry, which is wrong since a /Page Dictionary doesn't need to have a /Contents-entry in order to be valid.
   For consistency the `Catalog.getAllPageDicts` method is also updated to handle errors in the /Type-lookup correctly.

 - Reduce the `PagesCountLimit.PAUSE_EAGER_PAGE_INIT` viewer constant, to further improve loading/rendering performance of the *second* page during initialization of very long documents; PR 14359 follow-up.
---
 src/core/catalog.js   | 212 ++++++++++++++++++++----------------------
 test/unit/api_spec.js |   8 +-
 web/base_viewer.js    |   2 +-
 3 files changed, 102 insertions(+), 120 deletions(-)

diff --git a/src/core/catalog.js b/src/core/catalog.js
index c3777866a..54c6849ef 100644
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@@ -34,7 +34,6 @@ import {
   XRefEntryException,
 } from "./core_utils.js";
 import {
-  createPromiseCapability,
   createValidAbsoluteUrl,
   DocumentActionEventType,
   FormatError,
@@ -1091,8 +1090,7 @@ class Catalog {
     });
   }
 
-  getPageDict(pageIndex) {
-    const capability = createPromiseCapability();
+  async getPageDict(pageIndex) {
     const nodesToVisit = [this.toplevelPagesDict];
     const visitedNodes = new RefSet();
 
@@ -1104,130 +1102,105 @@ class Catalog {
       pageKidsCountCache = this.pageKidsCountCache;
     let currentPageIndex = 0;
 
-    function next() {
-      while (nodesToVisit.length) {
-        const currentNode = nodesToVisit.pop();
+    while (nodesToVisit.length) {
+      const currentNode = nodesToVisit.pop();
 
-        if (currentNode instanceof Ref) {
-          const count = pageKidsCountCache.get(currentNode);
-          // Skip nodes where the page can't be.
-          if (count >= 0 && currentPageIndex + count <= pageIndex) {
-            currentPageIndex += count;
-            continue;
+      if (currentNode instanceof Ref) {
+        const count = pageKidsCountCache.get(currentNode);
+        // Skip nodes where the page can't be.
+        if (count >= 0 && currentPageIndex + count <= pageIndex) {
+          currentPageIndex += count;
+          continue;
+        }
+        // Prevent circular references in the /Pages tree.
+        if (visitedNodes.has(currentNode)) {
+          throw new FormatError("Pages tree contains circular reference.");
+        }
+        visitedNodes.put(currentNode);
+
+        const obj = await xref.fetchAsync(currentNode);
+        if (obj instanceof Dict) {
+          let type = obj.getRaw("Type");
+          if (type instanceof Ref) {
+            type = await xref.fetchAsync(type);
           }
-          // Prevent circular references in the /Pages tree.
-          if (visitedNodes.has(currentNode)) {
-            capability.reject(
-              new FormatError("Pages tree contains circular reference.")
-            );
-            return;
-          }
-          visitedNodes.put(currentNode);
-
-          xref.fetchAsync(currentNode).then(function (obj) {
-            if (isDict(obj, "Page") || (isDict(obj) && !obj.has("Kids"))) {
-              // Cache the Page reference, since it can *greatly* improve
-              // performance by reducing redundant lookups in long documents
-              // where all nodes are found at *one* level of the tree.
-              if (currentNode && !pageKidsCountCache.has(currentNode)) {
-                pageKidsCountCache.put(currentNode, 1);
-              }
-
-              if (pageIndex === currentPageIndex) {
-                capability.resolve([obj, currentNode]);
-              } else {
-                currentPageIndex++;
-                next();
-              }
-              return;
+          if (isName(type, "Page") || !obj.has("Kids")) {
+            // Cache the Page reference, since it can *greatly* improve
+            // performance by reducing redundant lookups in long documents
+            // where all nodes are found at *one* level of the tree.
+            if (currentNode && !pageKidsCountCache.has(currentNode)) {
+              pageKidsCountCache.put(currentNode, 1);
             }
-            nodesToVisit.push(obj);
-            next();
-          }, capability.reject);
-          return;
-        }
 
-        // Must be a child page dictionary.
-        if (!(currentNode instanceof Dict)) {
-          capability.reject(
-            new FormatError(
-              "Page dictionary kid reference points to wrong type of object."
-            )
-          );
-          return;
-        }
-
-        let count;
-        try {
-          count = currentNode.get("Count");
-        } catch (ex) {
-          if (ex instanceof MissingDataException) {
-            throw ex;
-          }
-        }
-        if (Number.isInteger(count) && count >= 0) {
-          // Cache the Kids count, since it can reduce redundant lookups in
-          // documents where all nodes are found at *one* level of the tree.
-          const objId = currentNode.objId;
-          if (objId && !pageKidsCountCache.has(objId)) {
-            pageKidsCountCache.put(objId, count);
-          }
-          // Skip nodes where the page can't be.
-          if (currentPageIndex + count <= pageIndex) {
-            currentPageIndex += count;
-            continue;
-          }
-        }
-
-        let kids;
-        try {
-          kids = currentNode.get("Kids");
-        } catch (ex) {
-          if (ex instanceof MissingDataException) {
-            throw ex;
-          }
-        }
-        if (!Array.isArray(kids)) {
-          // Prevent errors in corrupt PDF documents that violate the
-          // specification by *inlining* Page dicts directly in the Kids
-          // array, rather than using indirect objects (fixes issue9540.pdf).
-          let type;
-          try {
-            type = currentNode.get("Type");
-          } catch (ex) {
-            if (ex instanceof MissingDataException) {
-              throw ex;
-            }
-          }
-          if (
-            isName(type, "Page") ||
-            (!currentNode.has("Type") && currentNode.has("Contents"))
-          ) {
             if (currentPageIndex === pageIndex) {
-              capability.resolve([currentNode, null]);
-              return;
+              return [obj, currentNode];
             }
             currentPageIndex++;
             continue;
           }
+        }
+        nodesToVisit.push(obj);
+        continue;
+      }
 
-          capability.reject(
-            new FormatError("Page dictionary kids object is not an array.")
-          );
-          return;
+      // Must be a child page dictionary.
+      if (!(currentNode instanceof Dict)) {
+        throw new FormatError(
+          "Page dictionary kid reference points to wrong type of object."
+        );
+      }
+      const { objId } = currentNode;
+
+      let count = currentNode.getRaw("Count");
+      if (count instanceof Ref) {
+        count = await xref.fetchAsync(count);
+      }
+      if (Number.isInteger(count) && count >= 0) {
+        // Cache the Kids count, since it can reduce redundant lookups in
+        // documents where all nodes are found at *one* level of the tree.
+        if (objId && !pageKidsCountCache.has(objId)) {
+          pageKidsCountCache.put(objId, count);
         }
 
-        // Always check all `Kids` nodes, to avoid getting stuck in an empty
-        // node further down in the tree (see issue5644.pdf, issue8088.pdf),
-        // and to ensure that we actually find the correct `Page` dict.
-        for (let last = kids.length - 1; last >= 0; last--) {
-          nodesToVisit.push(kids[last]);
+        // Skip nodes where the page can't be.
+        if (currentPageIndex + count <= pageIndex) {
+          currentPageIndex += count;
+          continue;
         }
       }
-      capability.reject(new Error(`Page index ${pageIndex} not found.`));
+
+      let kids = currentNode.getRaw("Kids");
+      if (kids instanceof Ref) {
+        kids = await xref.fetchAsync(kids);
+      }
+      if (!Array.isArray(kids)) {
+        // Prevent errors in corrupt PDF documents that violate the
+        // specification by *inlining* Page dicts directly in the Kids
+        // array, rather than using indirect objects (fixes issue9540.pdf).
+        let type = currentNode.getRaw("Type");
+        if (type instanceof Ref) {
+          type = await xref.fetchAsync(type);
+        }
+        if (isName(type, "Page") || !currentNode.has("Kids")) {
+          if (currentPageIndex === pageIndex) {
+            return [currentNode, null];
+          }
+          currentPageIndex++;
+          continue;
+        }
+
+        throw new FormatError("Page dictionary kids object is not an array.");
+      }
+
+      // Always check all `Kids` nodes, to avoid getting stuck in an empty
+      // node further down in the tree (see issue5644.pdf, issue8088.pdf),
+      // and to ensure that we actually find the correct `Page` dict.
+      for (let last = kids.length - 1; last >= 0; last--) {
+        nodesToVisit.push(kids[last]);
+      }
     }
-    next();
-    return capability.promise;
+
+    throw new Error(`Page index ${pageIndex} not found.`);
   }
 
   /**
@@ -1319,7 +1292,20 @@ class Catalog {
         break;
       }
 
-      if (isDict(obj, "Page") || !obj.has("Kids")) {
+      let type;
+      try {
+        type = obj.get("Type");
+      } catch (ex) {
+        if (ex instanceof MissingDataException) {
+          throw ex;
+        }
+        if (ex instanceof XRefEntryException && !recoveryMode) {
+          throw ex;
+        }
+        addPageError(ex);
+        break;
+      }
+      if (isName(type, "Page") || !obj.has("Kids")) {
         addPageDict(obj, kidObj instanceof Ref ? kidObj : null);
       } else {
         queue.push({ currentNode: obj, posInKids: 0 });
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 7eef2d2e4..24cc581bf 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -622,9 +622,7 @@ describe("api", function () {
         expect(false).toEqual(true);
       } catch (reason) {
         expect(reason instanceof UnknownErrorException).toEqual(true);
-        expect(reason.message).toEqual(
-          "Page dictionary kids object is not an array."
-        );
+        expect(reason.message).toEqual("Illegal character: 41");
       }
       try {
         await pdfDocument2.getPage(1);
@@ -633,9 +631,7 @@ describe("api", function () {
         expect(false).toEqual(true);
       } catch (reason) {
         expect(reason instanceof UnknownErrorException).toEqual(true);
-        expect(reason.message).toEqual(
-          "Page dictionary kids object is not an array."
-        );
+        expect(reason.message).toEqual("End of file inside array.");
       }
 
       await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
diff --git a/web/base_viewer.js b/web/base_viewer.js
index 757d65d12..f73d876ba 100644
--- a/web/base_viewer.js
+++ b/web/base_viewer.js
@@ -76,7 +76,7 @@ const ENABLE_PERMISSIONS_CLASS = "enablePermissions";
 const PagesCountLimit = {
   FORCE_SCROLL_MODE_PAGE: 15000,
   FORCE_LAZY_PAGE_INIT: 7500,
-  PAUSE_EAGER_PAGE_INIT: 500,
+  PAUSE_EAGER_PAGE_INIT: 250,
 };
 
 /**