From ad3a271fc40c71b1a1c8ac2ca4c09bc31e4a5df3 Mon Sep 17 00:00:00 2001
From: Jonas Jenwald <jonas.jenwald@gmail.com>
Date: Thu, 2 Dec 2021 16:40:31 +0100
Subject: [PATCH] [api-minor] Clear all caches in `XRef.indexObjects`, and
 improve /Root dictionary validation in `XRef.parse` (issue 14303)

*This patch improves handling of a couple of PDF documents from issue 14303.*

 - Update `XRef.indexObjects` to actually clear *all* XRef-caches. Invalid XRef tables *usually* cause issues early enough during parsing that we've not populated the XRef-cache, however to prevent any issues we obviously need to clear that one as well.

 - Improve the /Root dictionary validation in `XRef.parse` (PR 9827 follow-up). In addition to checking that a /Pages entry exists, we'll now also check that it can be successfully fetched *and* that it's of the correct type. There's really no point trying to use a /Root dictionary that e.g. `Catalog.toplevelPagesDict` will reject, and this way we'll be able to fallback to indexing the objects in corrupt documents.

 - Throw an `InvalidPDFException`, rather than a general `FormatError`, in `XRef.parse` when no usable /Root dictionary could be found. That really seems more appropriate overall, since all attempts at parsing/recovery have failed. (This part of the patch is API-observable, hence the tag.)

With these changes, two existing test-cases are improved and the unit-tests are updated/re-factored to highlight that. In particular `GHOSTSCRIPT-698804-1-fuzzed.pdf` will now both load and "render" correctly, whereas `poppler-395-0-fuzzed.pdf` will now fail immediately upon loading (rather than *appearing* to work).
---
 src/core/xref.js      | 25 +++++++++----
 test/unit/api_spec.js | 84 ++++++++++++++++++++++++-------------------
 2 files changed, 66 insertions(+), 43 deletions(-)

diff --git a/src/core/xref.js b/src/core/xref.js
index 7c2c2172a..acd3ec3a4 100644
--- a/src/core/xref.js
+++ b/src/core/xref.js
@@ -107,14 +107,26 @@ class XRef {
       }
       warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
     }
-    if (root instanceof Dict && root.has("Pages")) {
-      this.root = root;
-    } else {
-      if (!recoveryMode) {
-        throw new XRefParseException();
+    if (root instanceof Dict) {
+      try {
+        const pages = root.get("Pages");
+        if (pages instanceof Dict) {
+          this.root = root;
+          return;
+        }
+      } catch (ex) {
+        if (ex instanceof MissingDataException) {
+          throw ex;
+        }
+        warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`);
       }
-      throw new FormatError("Invalid root reference");
     }
+
+    if (!recoveryMode) {
+      throw new XRefParseException();
+    }
+    // Even recovery failed, there's nothing more we can do here.
+    throw new InvalidPDFException("Invalid Root reference.");
   }
 
   processXRefTable(parser) {
@@ -417,6 +429,7 @@ class XRef {
 
     // Clear out any existing entries, since they may be bogus.
     this.entries.length = 0;
+    this._cacheMap.clear();
 
     const stream = this.stream;
     stream.pos = 0;
diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js
index 81b9f0d48..a5552d52e 100644
--- a/test/unit/api_spec.js
+++ b/test/unit/api_spec.js
@@ -445,7 +445,7 @@ describe("api", function () {
       await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
     });
 
-    it("creates pdf doc from PDF file with bad XRef table", async function () {
+    it("creates pdf doc from PDF file with bad XRef entry", async function () {
       // A corrupt PDF file, where the XRef table have (some) bogus entries.
       const loadingTask = getDocument(
         buildGetDocumentParams("PDFBOX-4352-0.pdf", {
@@ -468,6 +468,26 @@ describe("api", function () {
       await loadingTask.destroy();
     });
 
+    it("creates pdf doc from PDF file with bad XRef header", async function () {
+      const loadingTask = getDocument(
+        buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
+      );
+      expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
+
+      const pdfDocument = await loadingTask.promise;
+      expect(pdfDocument.numPages).toEqual(1);
+
+      const page = await pdfDocument.getPage(1);
+      expect(page instanceof PDFPageProxy).toEqual(true);
+
+      const opList = await page.getOperatorList();
+      expect(opList.fnArray.length).toEqual(0);
+      expect(opList.argsArray.length).toEqual(0);
+      expect(opList.lastChunk).toEqual(true);
+
+      await loadingTask.destroy();
+    });
+
     it("creates pdf doc from PDF file with bad XRef byteWidths", async function () {
       // A corrupt PDF file, where the XRef /W-array have (some) bogus entries.
       const loadingTask = getDocument(
@@ -488,6 +508,25 @@ describe("api", function () {
       await loadingTask.destroy();
     });
 
+    it("creates pdf doc from PDF file with inaccessible /Pages tree", async function () {
+      const loadingTask = getDocument(
+        buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
+      );
+      expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
+
+      try {
+        await loadingTask.promise;
+
+        // Shouldn't get here.
+        expect(false).toEqual(true);
+      } catch (reason) {
+        expect(reason instanceof InvalidPDFException).toEqual(true);
+        expect(reason.message).toEqual("Invalid Root reference.");
+      }
+
+      await loadingTask.destroy();
+    });
+
     it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
       const loadingTask1 = getDocument(
         buildGetDocumentParams("poppler-67295-0.pdf")
@@ -495,30 +534,23 @@ describe("api", function () {
       const loadingTask2 = getDocument(
         buildGetDocumentParams("poppler-85140-0.pdf")
       );
-      const loadingTask3 = getDocument(
-        buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
-      );
-      const loadingTask4 = getDocument(
-        buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
-      );
 
       expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
       expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
-      expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
-      expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);
 
       const pdfDocument1 = await loadingTask1.promise;
       const pdfDocument2 = await loadingTask2.promise;
-      const pdfDocument3 = await loadingTask3.promise;
-      const pdfDocument4 = await loadingTask4.promise;
 
       expect(pdfDocument1.numPages).toEqual(1);
       expect(pdfDocument2.numPages).toEqual(1);
-      expect(pdfDocument3.numPages).toEqual(1);
-      expect(pdfDocument4.numPages).toEqual(1);
 
-      const pageA = await pdfDocument1.getPage(1);
-      expect(pageA instanceof PDFPageProxy).toEqual(true);
+      const page = await pdfDocument1.getPage(1);
+      expect(page instanceof PDFPageProxy).toEqual(true);
+
+      const opList = await page.getOperatorList();
+      expect(opList.fnArray.length).toBeGreaterThan(5);
+      expect(opList.argsArray.length).toBeGreaterThan(5);
+      expect(opList.lastChunk).toEqual(true);
 
       try {
         await pdfDocument2.getPage(1);
@@ -529,28 +561,6 @@ describe("api", function () {
         expect(reason instanceof UnknownErrorException).toEqual(true);
         expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
       }
-      try {
-        await pdfDocument3.getPage(1);
-
-        // Shouldn't get here.
-        expect(false).toEqual(true);
-      } catch (reason) {
-        expect(reason instanceof UnknownErrorException).toEqual(true);
-        expect(reason.message).toEqual(
-          "Page dictionary kid reference points to wrong type of object."
-        );
-      }
-      try {
-        await pdfDocument4.getPage(1);
-
-        // Shouldn't get here.
-        expect(false).toEqual(true);
-      } catch (reason) {
-        expect(reason instanceof UnknownErrorException).toEqual(true);
-        expect(reason.message).toEqual(
-          "Page dictionary kid reference points to wrong type of object."
-        );
-      }
 
       await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
     });