Check that the first page can be successfully loaded, to try and ascertain the validity of the XRef table (issue 7496, issue 10326)

For PDF documents with sufficiently broken XRef tables, it's usually quite obvious when you need to fallback to indexing the entire file. However, for certain kinds of corrupted PDF documents the XRef table will, for all intents and purposes, appear to be valid. It's not until you actually try to fetch various objects that things will start to break, which is the case in the referenced issues[1]. Since there's generally a real effort being in made PDF.js to load even corrupt PDF documents, this patch contains a suggested approach to attempt to do a bit more validation of the XRef table during the initial document loading phase. Here the choice is made to attempt to load the *first* page, as a basic sanity check of the validity of the XRef table. Please note that attempting to load a more-or-less arbitrarily chosen object without any context of what it's supposed to be isn't a very useful, which is why this particular choice was made. Obviously, just because the first page can be loaded successfully that doesn't guarantee that the *entire* XRef table is valid, however if even the first page fails to load you can be reasonably sure that the document is *not* valid[2]. Even though this patch won't cause any significant increase in the amount of parsing required during initial loading of the document[3], it will require loading of more data upfront which thus delays the initial `getDocument` call. Whether or not this is a problem depends very much on what you actually measure, please consider the following examples: ```javascript console.time('first'); getDocument(...).promise.then((pdfDocument) => { console.timeEnd('first'); }); console.time('second'); getDocument(...).promise.then((pdfDocument) => { pdfDocument.getPage(1).then((pdfPage) => { // Note: the API uses `pageNumber >= 1`, the Worker uses `pageIndex >= 0`. console.timeEnd('second'); }); }); ``` The first case is pretty much guaranteed to show a small regression, however the second case won't be affected at all since the Worker caches the result of `getPage` calls. Again, please remember that the second case is what matters for the standard PDF.js use-case which is why I'm hoping that this patch is deemed acceptable. --- [1] In issue 7496, the problem is that the document is edited without the XRef table being correctly updated. In issue 10326, the generator was sorting the XRef table according to the offsets rather than the objects. [2] The idea of checking the first page in particular came from the "standard" use-case for the PDF.js library, i.e. the default viewer, where a failure to load the first page basically means that nothing will work; note how `{BaseViewer, PDFThumbnailViewer}.setDocument` depends completely on being able to fetch the *first* page. [3] The only extra parsing is caused by, potentially, having to traverse *part* of the `Pages` tree to find the first page.
2018-12-04 21:51:27 +01:00 · 2018-12-04 21:51:27 +01:00 · 60bcce184e
commit 60bcce184e
parent d3868e1bd1
7 changed files with 68 additions and 36 deletions
--- a/src/core/document.js
+++ b/src/core/document.js
@ -16,7 +16,7 @@
 import {
  assert, FormatError, getInheritableProperty, info, isArrayBuffer, isBool,
  isNum, isSpace, isString, MissingDataException, OPS, shadow, stringToBytes,
-  stringToPDFString, Util, warn
+  stringToPDFString, Util, warn, XRefEntryException, XRefParseException
 } from '../shared/util';
 import { Catalog, ObjectLoader, XRef } from './obj';
 import { Dict, isDict, isName, isStream, Ref } from './primitives';
@ -649,6 +649,20 @@ var PDFDocument = (function PDFDocumentClosure() {
      });
    },

+    checkFirstPage() {
+      return this.getPage(0).catch((reason) => {
+        if (reason instanceof XRefEntryException) {
+          // Clear out the various caches to ensure that we haven't stored any
+          // inconsistent and/or incorrect state, since that could easily break
+          // subsequent `this.getPage` calls.
+          this._pagePromises.length = 0;
+          this.cleanup();
+
+          throw new XRefParseException();
+        }
+      });
+    },
+
    cleanup: function PDFDocument_cleanup() {
      return this.catalog.cleanup();
    },
--- a/src/core/obj.js
+++ b/src/core/obj.js
@ -17,7 +17,7 @@ import {
  bytesToString, createPromiseCapability, createValidAbsoluteUrl, FormatError,
  info, InvalidPDFException, isBool, isNum, isString, MissingDataException,
  PermissionFlag, shadow, stringToPDFString, stringToUTF8String,
-  toRomanNumerals, unreachable, warn, XRefParseException
+  toRomanNumerals, unreachable, warn, XRefEntryException, XRefParseException
 } from '../shared/util';
 import {
  Dict, isCmd, isDict, isName, isRef, isRefsEqual, isStream, Ref, RefSet,
@ -1473,7 +1473,7 @@ var XRef = (function XRefClosure() {
      if (xrefEntry.uncompressed) {
        xrefEntry = this.fetchUncompressed(ref, xrefEntry, suppressEncryption);
      } else {
-        xrefEntry = this.fetchCompressed(xrefEntry, suppressEncryption);
+        xrefEntry = this.fetchCompressed(ref, xrefEntry, suppressEncryption);
      }
      if (isDict(xrefEntry)) {
        xrefEntry.objId = ref.toString();
@ -1483,12 +1483,11 @@ var XRef = (function XRefClosure() {
      return xrefEntry;
    },

-    fetchUncompressed: function XRef_fetchUncompressed(ref, xrefEntry,
-                                                       suppressEncryption) {
+    fetchUncompressed(ref, xrefEntry, suppressEncryption = false) {
      var gen = ref.gen;
      var num = ref.num;
      if (xrefEntry.gen !== gen) {
-        throw new FormatError('inconsistent generation in XRef');
+        throw new XRefEntryException(`Inconsistent generation in XRef: ${ref}`);
      }
      var stream = this.stream.makeSubStream(xrefEntry.offset +
                                             this.stream.start);
@ -1504,7 +1503,7 @@ var XRef = (function XRefClosure() {
        obj2 = parseInt(obj2, 10);
      }
      if (obj1 !== num || obj2 !== gen || !isCmd(obj3)) {
-        throw new FormatError('bad XRef entry');
+        throw new XRefEntryException(`Bad (uncompressed) XRef entry: ${ref}`);
      }
      if (obj3.cmd !== 'obj') {
        // some bad PDFs use "obj1234" and really mean 1234
@ -1514,7 +1513,7 @@ var XRef = (function XRefClosure() {
            return num;
          }
        }
-        throw new FormatError('bad XRef entry');
+        throw new XRefEntryException(`Bad (uncompressed) XRef entry: ${ref}`);
      }
      if (this.encrypt && !suppressEncryption) {
        xrefEntry = parser.getObj(this.encrypt.createCipherTransform(num, gen));
@ -1527,8 +1526,7 @@ var XRef = (function XRefClosure() {
      return xrefEntry;
    },

-    fetchCompressed: function XRef_fetchCompressed(xrefEntry,
-                                                   suppressEncryption) {
+    fetchCompressed(ref, xrefEntry, suppressEncryption = false) {
      var tableOffset = xrefEntry.offset;
      var stream = this.fetch(new Ref(tableOffset, 0));
      if (!isStream(stream)) {
@ -1573,7 +1571,7 @@ var XRef = (function XRefClosure() {
      }
      xrefEntry = entries[xrefEntry.gen];
      if (xrefEntry === undefined) {
-        throw new FormatError('bad XRef entry for compressed object');
+        throw new XRefEntryException(`Bad (compressed) XRef entry: ${ref}`);
      }
      return xrefEntry;
    },
--- a/src/core/worker.js
+++ b/src/core/worker.js
@ -258,33 +258,22 @@ var WorkerMessageHandler = {
      WorkerTasks.splice(i, 1);
    }

-    function loadDocument(recoveryMode) {
-      var loadDocumentCapability = createPromiseCapability();
+    async function loadDocument(recoveryMode) {
+      await pdfManager.ensureDoc('checkHeader');
+      await pdfManager.ensureDoc('parseStartXRef');
+      await pdfManager.ensureDoc('parse', [recoveryMode]);

-      var parseSuccess = function parseSuccess() {
-        Promise.all([
-          pdfManager.ensureDoc('numPages'),
-          pdfManager.ensureDoc('fingerprint'),
-        ]).then(function([numPages, fingerprint]) {
-          loadDocumentCapability.resolve({
-            numPages,
-            fingerprint,
-          });
-        }, parseFailure);
-      };
+      if (!recoveryMode) {
+        // Check that at least the first page can be successfully loaded,
+        // since otherwise the XRef table is definitely not valid.
+        await pdfManager.ensureDoc('checkFirstPage');
+      }

-      var parseFailure = function parseFailure(e) {
-        loadDocumentCapability.reject(e);
-      };
-
-      pdfManager.ensureDoc('checkHeader', []).then(function() {
-        pdfManager.ensureDoc('parseStartXRef', []).then(function() {
-          pdfManager.ensureDoc('parse', [recoveryMode]).then(
-            parseSuccess, parseFailure);
-        }, parseFailure);
-      }, parseFailure);
-
-      return loadDocumentCapability.promise;
+      const [numPages, fingerprint] = await Promise.all([
+        pdfManager.ensureDoc('numPages'),
+        pdfManager.ensureDoc('fingerprint'),
+      ]);
+      return { numPages, fingerprint, };
    }

    function getPdfManager(data, evaluatorOptions) {
--- a/src/shared/util.js
+++ b/src/shared/util.js
@ -472,6 +472,18 @@ var MissingDataException = (function MissingDataExceptionClosure() {
  return MissingDataException;
 })();

+const XRefEntryException = (function XRefEntryExceptionClosure() {
+  function XRefEntryException(msg) {
+    this.message = msg;
+  }
+
+  XRefEntryException.prototype = new Error();
+  XRefEntryException.prototype.name = 'XRefEntryException';
+  XRefEntryException.constructor = XRefEntryException;
+
+  return XRefEntryException;
+})();
+
 var XRefParseException = (function XRefParseExceptionClosure() {
  function XRefParseException(msg) {
    this.message = msg;
@ -1033,6 +1045,7 @@ export {
  UnknownErrorException,
  Util,
  toRomanNumerals,
+  XRefEntryException,
  XRefParseException,
  FormatError,
  arrayByteLength,
--- a/test/pdfs/issue10326.pdf.link
+++ b/test/pdfs/issue10326.pdf.link
@ -0,0 +1 @@
+https://github.com/mozilla/pdf.js/files/2643238/test.1.pdf
--- a/test/pdfs/issue7496.pdf.link
+++ b/test/pdfs/issue7496.pdf.link
@ -0,0 +1 @@
+https://github.com/mozilla/pdf.js/files/369694/repro-pdf.pdf
--- a/test/test_manifest.json
+++ b/test/test_manifest.json
@ -1275,6 +1275,22 @@
       "rounds": 1,
       "type": "eq"
    },
+    {  "id": "issue7496",
+       "file": "pdfs/issue7496.pdf",
+       "md5": "b422981ae781166e75c0fb4c3634ed96",
+       "link": true,
+       "rounds": 1,
+       "lastPage": 1,
+       "type": "load"
+    },
+    {  "id": "issue10326",
+       "file": "pdfs/issue10326.pdf",
+       "md5": "015c13b09ef735ea1204f38992c60487",
+       "link": true,
+       "rounds": 1,
+       "lastPage": 1,
+       "type": "load"
+    },
    {  "id": "issue7544",
       "file": "pdfs/issue7544.pdf",
       "md5": "87e3a9fc7d6a6c1bd5b53af6926ce48e",
				`@ -0,0 +1 @@`
				`https://github.com/mozilla/pdf.js/files/2643238/test.1.pdf`