From d0c4bbd8287a58856c04b0e1201f64ddccf57199 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Thu, 25 Nov 2021 18:34:11 +0100
Subject: [PATCH] [api-minor] Validate the /Pages-tree /Count entry during
 document initialization (issue 14303)

*This patch basically extends the approach from PR 10392, by also checking the last page.*

Currently, in e.g. the `Catalog.numPages`-getter, we're simply assuming that if the /Pages-tree has an *integer* /Count entry it must also be correct/valid.
As can be seen in the referenced PDF documents, that entry may be completely bogus which causes general parsing to breaking down elsewhere in the worker-thread (and hanging the browser).

Rather than hoping that the /Count entry is correct, similar to all other data found in PDF documents, we obviously need to validate it. This turns out to be a little less straightforward than one would like, since the only way to do this (as far as I know) is to parse the *entire* /Pages-tree and essentially counting the pages.
To avoid doing that for all documents, this patch tries to take a short-cut by checking if the last page (based on the /Count entry) can be successfully fetched. If so, we assume that the /Count entry is correct and use it as-is, otherwise we'll iterate through (potentially) the *entire* /Pages-tree to determine the number of pages.

Unfortunately these changes will have a number of *somewhat* negative side-effects, please see a possibly incomplete list below, however I cannot see a better way to address this bug.
 - This will slow down initial loading/rendering of all documents, at least by some amount, since we now need to fetch/parse more of the /Pages-tree in order to be able to access the *last* page of the PDF documents.
 - For poorly generated PDF documents, where the entire /Pages-tree only has *one* level, we'll unfortunately need to fetch/parse the *entire* /Pages-tree to get to the last page. While there's a cache to help reduce repeated data lookups, this will affect initial loading/rendering of *some* long PDF documents,
 - This will affect the `disableAutoFetch = true` mode negatively, since we now need to fetch/parse more data during document initialization. While the `disableAutoFetch = true` mode should still be helpful in larger/longer PDF documents, for smaller ones the effect/usefulness may unfortunately be lost.

As one *small* additional bonus, we should now also be able to support opening PDF documents where the /Pages-tree /Count entry is completely invalid (e.g. contains a non-integer value).

Fixes two of the issues listed in issue 14303, namely the `poppler-67295-0.pdf` and `poppler-85140-0.pdf` documents.
---
 src/core/catalog.js           | 26 ++++++++++---
 src/core/core_utils.js        |  7 ++++
 src/core/document.js          | 69 +++++++++++++++++++++++++++++++++--
 src/core/worker.js            | 11 +++---
 test/pdfs/.gitignore          |  2 +
 test/pdfs/poppler-67295-0.pdf | 40 ++++++++++++++++++++
 test/pdfs/poppler-85140-0.pdf | 36 ++++++++++++++++++
 test/unit/api_spec.js         | 40 +++++++++++++++++++-
 8 files changed, 215 insertions(+), 16 deletions(-)
 create mode 100644 test/pdfs/poppler-67295-0.pdf
 create mode 100644 test/pdfs/poppler-85140-0.pdf

diff --git a/src/core/catalog.js b/src/core/catalog.js
index 5bc73d8af..9bad3bd47 100644
--- a/src/core/catalog.js
+++ b/src/core/catalog.js
@@ -28,6 +28,7 @@ import {
 import {
   collectActions,
   MissingDataException,
+  PageDictMissingException,
   recoverJsURL,
   toRomanNumerals,
 } from "./core_utils.js";
@@ -70,6 +71,7 @@ class Catalog {
     if (!isDict(this._catDict)) {
       throw new FormatError("Catalog object is not a dictionary.");
     }
+    this._actualNumPages = null;
 
     this.fontCache = new RefSetCache();
     this.builtInCMapCache = new Map();
@@ -534,14 +536,26 @@ class Catalog {
     };
   }
 
-  get numPages() {
+  setActualNumPages(num = null) {
+    this._actualNumPages = num;
+  }
+
+  get hasActualNumPages() {
+    return this._actualNumPages !== null;
+  }
+
+  get _pagesCount() {
     const obj = this.toplevelPagesDict.get("Count");
     if (!Number.isInteger(obj)) {
       throw new FormatError(
         "Page count in top-level pages dictionary is not an integer."
       );
     }
-    return shadow(this, "numPages", obj);
+    return shadow(this, "_pagesCount", obj);
+  }
+
+  get numPages() {
+    return this.hasActualNumPages ? this._actualNumPages : this._pagesCount;
   }
 
   get destinations() {
@@ -1071,7 +1085,7 @@ class Catalog {
     });
   }
 
-  getPageDict(pageIndex) {
+  getPageDict(pageIndex, skipCount = false) {
     const capability = createPromiseCapability();
     const nodesToVisit = [this._catDict.getRaw("Pages")];
     const visitedNodes = new RefSet();
@@ -1133,7 +1147,7 @@ class Catalog {
         }
 
         count = currentNode.get("Count");
-        if (Number.isInteger(count) && count >= 0) {
+        if (Number.isInteger(count) && count >= 0 && !skipCount) {
           // Cache the Kids count, since it can reduce redundant lookups in
           // documents where all nodes are found at *one* level of the tree.
           const objId = currentNode.objId;
@@ -1177,7 +1191,9 @@ class Catalog {
           nodesToVisit.push(kids[last]);
         }
       }
-      capability.reject(new Error(`Page index ${pageIndex} not found.`));
+      capability.reject(
+        new PageDictMissingException(`Page index ${pageIndex} not found.`)
+      );
     }
     next();
     return capability.promise;
diff --git a/src/core/core_utils.js b/src/core/core_utils.js
index bea871789..0fd7f3014 100644
--- a/src/core/core_utils.js
+++ b/src/core/core_utils.js
@@ -60,6 +60,12 @@ class MissingDataException extends BaseException {
   }
 }
 
+class PageDictMissingException extends BaseException {
+  constructor(msg) {
+    super(msg, "PageDictMissingException");
+  }
+}
+
 class ParserEOFException extends BaseException {
   constructor(msg) {
     super(msg, "ParserEOFException");
@@ -541,6 +547,7 @@ export {
   isWhiteSpace,
   log2,
   MissingDataException,
+  PageDictMissingException,
   ParserEOFException,
   parseXFAPath,
   readInt8,
diff --git a/src/core/document.js b/src/core/document.js
index 3cee33389..bc33b1952 100644
--- a/src/core/document.js
+++ b/src/core/document.js
@@ -50,6 +50,7 @@ import {
   getInheritableProperty,
   isWhiteSpace,
   MissingDataException,
+  PageDictMissingException,
   validateCSSFont,
   XRefEntryException,
   XRefParseException,
@@ -787,7 +788,9 @@ class PDFDocument {
 
   get numPages() {
     let num = 0;
-    if (this.xfaFactory) {
+    if (this.catalog.hasActualNumPages) {
+      num = this.catalog.numPages;
+    } else if (this.xfaFactory) {
       // num is a Promise.
       num = this.xfaFactory.getNumPages();
     } else if (this.linearization) {
@@ -1331,8 +1334,13 @@ class PDFDocument {
     return promise;
   }
 
-  checkFirstPage() {
-    return this.getPage(0).catch(async reason => {
+  async checkFirstPage(recoveryMode = false) {
+    if (recoveryMode) {
+      return;
+    }
+    try {
+      await this.getPage(0);
+    } catch (reason) {
       if (reason instanceof XRefEntryException) {
         // Clear out the various caches to ensure that we haven't stored any
         // inconsistent and/or incorrect state, since that could easily break
@@ -1342,7 +1350,60 @@ class PDFDocument {
 
         throw new XRefParseException();
       }
-    });
+    }
+  }
+
+  async checkLastPage(recoveryMode = false) {
+    this.catalog.setActualNumPages(); // Ensure that it's always reset.
+    let numPages;
+
+    try {
+      await Promise.all([
+        this.pdfManager.ensureDoc("xfaFactory"),
+        this.pdfManager.ensureDoc("linearization"),
+        this.pdfManager.ensureCatalog("numPages"),
+      ]);
+
+      if (this.xfaFactory) {
+        return; // The Page count is always calculated for XFA-documents.
+      } else if (this.linearization) {
+        numPages = this.linearization.numPages;
+      } else {
+        numPages = this.catalog.numPages;
+      }
+
+      if (numPages === 1) {
+        return;
+      } else if (!Number.isInteger(numPages)) {
+        throw new FormatError("Page count is not an integer.");
+      }
+      await this.getPage(numPages - 1);
+    } catch (reason) {
+      warn(`checkLastPage - invalid /Pages tree /Count: ${numPages}.`);
+      // Clear out the various caches to ensure that we haven't stored any
+      // inconsistent and/or incorrect state, since that could easily break
+      // subsequent `this.getPage` calls.
+      await this.cleanup();
+
+      let pageIndex = 1; // The first page was already loaded.
+      while (true) {
+        try {
+          await this.getPage(pageIndex, /* skipCount = */ true);
+        } catch (reasonLoop) {
+          if (reasonLoop instanceof PageDictMissingException) {
+            break;
+          }
+          if (reasonLoop instanceof XRefEntryException) {
+            if (!recoveryMode) {
+              throw new XRefParseException();
+            }
+            break;
+          }
+        }
+        pageIndex++;
+      }
+      this.catalog.setActualNumPages(pageIndex);
+    }
   }
 
   fontFallback(id, handler) {
diff --git a/src/core/worker.js b/src/core/worker.js
index f6b41df99..24694c0a8 100644
--- a/src/core/worker.js
+++ b/src/core/worker.js
@@ -170,11 +170,12 @@ class WorkerMessageHandler {
       await pdfManager.ensureDoc("parseStartXRef");
       await pdfManager.ensureDoc("parse", [recoveryMode]);
 
-      if (!recoveryMode) {
-        // Check that at least the first page can be successfully loaded,
-        // since otherwise the XRef table is definitely not valid.
-        await pdfManager.ensureDoc("checkFirstPage");
-      }
+      // Check that at least the first page can be successfully loaded,
+      // since otherwise the XRef table is definitely not valid.
+      await pdfManager.ensureDoc("checkFirstPage", [recoveryMode]);
+      // Check that the last page can be sucessfully loaded, to ensure that
+      // `numPages` is correct, and fallback to walking the entire /Pages-tree.
+      await pdfManager.ensureDoc("checkLastPage", [recoveryMode]);
 
       const isPureXfa = await pdfManager.ensureDoc("isPureXfa");
       if (isPureXfa) {
diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore
index 889b2a104..c4fcf9d30 100644
--- a/test/pdfs/.gitignore
+++ b/test/pdfs/.gitignore
@@ -490,3 +490,5 @@
 !PDFBOX-4352-0.pdf
 !REDHAT-1531897-0.pdf
 !xfa_issue14315.pdf
+!poppler-67295-0.pdf
+!poppler-85140-0.pdf
diff --git a/test/pdfs/poppler-67295-0.pdf b/test/pdfs/poppler-67295-0.pdf
new file mode 100644
index 000000000..eb54bf85d
--- /dev/null
+++ b/test/pdfs/poppler-67295-0.pdf
@@ -0,0 +1,40 @@
+%PDF-1.2
+1 0 obj
+<</Type /Catalog /Outlines 2 0 R /Pages 6 0 R>>
+endobj
+2 0 obj
+<</Type /Outlines /Count 0>>
+endobj
+3 0 obj
+<</Length 44>>
+stream
+BT /F1 24 Tf 20 750 Td (TestString123) Tj ET
+endstream
+endobj
+4 0 obj
+[/PDF /Text]
+endobj
+5 0 obj
+<</Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /MacRomanEncoding>>
+endobj
+6 0 obj
+<</Type /Pages /Kids [7 0 R] /Count 9999999999>>
+endobj
+7 0 obj
+<</Type /Page /Parent 6 0 R /MediaBox [0 0 612 792] /Contents 3 0 R /Resources <</ProcSet 4 0 R /Font <</F1 5 0 R>>>>>>
+endobj
+xref
+0 8
+0000000000 65535 f
+0000000010 00000 n
+0000000076 00000 n
+0000000123 00000 n
+0000000221 00000 n
+0000000252 00000 n
+0000000361 00000 n
+0000000428 00000 n
+trailer
+<</Size 8 /Root 1 0 R>>
+startxref
+566
+%%EOF
diff --git a/test/pdfs/poppler-85140-0.pdf b/test/pdfs/poppler-85140-0.pdf
new file mode 100644
index 000000000..5ae8023b1
--- /dev/null
+++ b/test/pdfs/poppler-85140-0.pdf
@@ -0,0 +1,36 @@
+%PDF-1.4
+1 0 obj
+<<
+	/Type /Catalog
+	/Pages 2 0 R
+>>
+endobj
+2 0 obj
+<<
+	/Type /Pages
+	/Kids [3 0 R]
+	/Count 213804087
+>>
+endobj
+3 18446744073709551616 obj
+<<
+	/Type /Page
+	/Parent 2 0 R
+	/MediaBox [0 0 595 2147483647]
+	/Contents 4 0 R
+>>
+endobj
+4 233245 obj
+<<
+>>
+stream
+endstream
+endobj
+5 0 obj
+>>
+endobj
+trailer
+<< 
+	/Root 1 0 R
+>>
+%%EOF
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index f61d19a35..c8d98acf3 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -25,6 +25,7 @@ import {
   PasswordResponses,
   PermissionFlag,
   StreamType,
+  UnknownErrorException,
 } from "../../src/shared/util.js";
 import {
   buildGetDocumentParams,
@@ -478,6 +479,38 @@ describe("api", function () {
 
       await loadingTask.destroy();
     });
+
+    it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
+      const loadingTask1 = getDocument(
+        buildGetDocumentParams("poppler-67295-0.pdf")
+      );
+      const loadingTask2 = getDocument(
+        buildGetDocumentParams("poppler-85140-0.pdf")
+      );
+      expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
+      expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
+
+      const pdfDocument1 = await loadingTask1.promise;
+      const pdfDocument2 = await loadingTask2.promise;
+
+      expect(pdfDocument1.numPages).toEqual(1);
+      expect(pdfDocument2.numPages).toEqual(1);
+
+      const pageA = await pdfDocument1.getPage(1);
+      expect(pageA instanceof PDFPageProxy).toEqual(true);
+
+      try {
+        await pdfDocument2.getPage(1);
+
+        // Shouldn't get here.
+        expect(false).toEqual(true);
+      } catch (reason) {
+        expect(reason instanceof UnknownErrorException).toEqual(true);
+        expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
+      }
+
+      await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
+    });
   });
 
   describe("PDFWorker", function () {
@@ -683,7 +716,7 @@ describe("api", function () {
             throw new Error("shall fail for invalid page");
           },
           function (reason) {
-            expect(reason instanceof Error).toEqual(true);
+            expect(reason instanceof UnknownErrorException).toEqual(true);
             expect(reason.message).toEqual(
               "Pages tree contains circular reference."
             );
@@ -724,7 +757,10 @@ describe("api", function () {
         // Shouldn't get here.
         expect(false).toEqual(true);
       } catch (reason) {
-        expect(reason instanceof Error).toEqual(true);
+        expect(reason instanceof UnknownErrorException).toEqual(true);
+        expect(reason.message).toEqual(
+          "The reference does not point to a /Page dictionary."
+        );
       }
     });