From d0c4bbd8287a58856c04b0e1201f64ddccf57199 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Thu, 25 Nov 2021 18:34:11 +0100 Subject: [PATCH] [api-minor] Validate the /Pages-tree /Count entry during document initialization (issue 14303) *This patch basically extends the approach from PR 10392, by also checking the last page.* Currently, in e.g. the `Catalog.numPages`-getter, we're simply assuming that if the /Pages-tree has an *integer* /Count entry it must also be correct/valid. As can be seen in the referenced PDF documents, that entry may be completely bogus which causes general parsing to breaking down elsewhere in the worker-thread (and hanging the browser). Rather than hoping that the /Count entry is correct, similar to all other data found in PDF documents, we obviously need to validate it. This turns out to be a little less straightforward than one would like, since the only way to do this (as far as I know) is to parse the *entire* /Pages-tree and essentially counting the pages. To avoid doing that for all documents, this patch tries to take a short-cut by checking if the last page (based on the /Count entry) can be successfully fetched. If so, we assume that the /Count entry is correct and use it as-is, otherwise we'll iterate through (potentially) the *entire* /Pages-tree to determine the number of pages. Unfortunately these changes will have a number of *somewhat* negative side-effects, please see a possibly incomplete list below, however I cannot see a better way to address this bug. - This will slow down initial loading/rendering of all documents, at least by some amount, since we now need to fetch/parse more of the /Pages-tree in order to be able to access the *last* page of the PDF documents. - For poorly generated PDF documents, where the entire /Pages-tree only has *one* level, we'll unfortunately need to fetch/parse the *entire* /Pages-tree to get to the last page. While there's a cache to help reduce repeated data lookups, this will affect initial loading/rendering of *some* long PDF documents, - This will affect the `disableAutoFetch = true` mode negatively, since we now need to fetch/parse more data during document initialization. While the `disableAutoFetch = true` mode should still be helpful in larger/longer PDF documents, for smaller ones the effect/usefulness may unfortunately be lost. As one *small* additional bonus, we should now also be able to support opening PDF documents where the /Pages-tree /Count entry is completely invalid (e.g. contains a non-integer value). Fixes two of the issues listed in issue 14303, namely the `poppler-67295-0.pdf` and `poppler-85140-0.pdf` documents. --- src/core/catalog.js | 26 ++++++++++--- src/core/core_utils.js | 7 ++++ src/core/document.js | 69 +++++++++++++++++++++++++++++++++-- src/core/worker.js | 11 +++--- test/pdfs/.gitignore | 2 + test/pdfs/poppler-67295-0.pdf | 40 ++++++++++++++++++++ test/pdfs/poppler-85140-0.pdf | 36 ++++++++++++++++++ test/unit/api_spec.js | 40 +++++++++++++++++++- 8 files changed, 215 insertions(+), 16 deletions(-) create mode 100644 test/pdfs/poppler-67295-0.pdf create mode 100644 test/pdfs/poppler-85140-0.pdf diff --git a/src/core/catalog.js b/src/core/catalog.js index 5bc73d8af..9bad3bd47 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -28,6 +28,7 @@ import { import { collectActions, MissingDataException, + PageDictMissingException, recoverJsURL, toRomanNumerals, } from "./core_utils.js"; @@ -70,6 +71,7 @@ class Catalog { if (!isDict(this._catDict)) { throw new FormatError("Catalog object is not a dictionary."); } + this._actualNumPages = null; this.fontCache = new RefSetCache(); this.builtInCMapCache = new Map(); @@ -534,14 +536,26 @@ class Catalog { }; } - get numPages() { + setActualNumPages(num = null) { + this._actualNumPages = num; + } + + get hasActualNumPages() { + return this._actualNumPages !== null; + } + + get _pagesCount() { const obj = this.toplevelPagesDict.get("Count"); if (!Number.isInteger(obj)) { throw new FormatError( "Page count in top-level pages dictionary is not an integer." ); } - return shadow(this, "numPages", obj); + return shadow(this, "_pagesCount", obj); + } + + get numPages() { + return this.hasActualNumPages ? this._actualNumPages : this._pagesCount; } get destinations() { @@ -1071,7 +1085,7 @@ class Catalog { }); } - getPageDict(pageIndex) { + getPageDict(pageIndex, skipCount = false) { const capability = createPromiseCapability(); const nodesToVisit = [this._catDict.getRaw("Pages")]; const visitedNodes = new RefSet(); @@ -1133,7 +1147,7 @@ class Catalog { } count = currentNode.get("Count"); - if (Number.isInteger(count) && count >= 0) { + if (Number.isInteger(count) && count >= 0 && !skipCount) { // Cache the Kids count, since it can reduce redundant lookups in // documents where all nodes are found at *one* level of the tree. const objId = currentNode.objId; @@ -1177,7 +1191,9 @@ class Catalog { nodesToVisit.push(kids[last]); } } - capability.reject(new Error(`Page index ${pageIndex} not found.`)); + capability.reject( + new PageDictMissingException(`Page index ${pageIndex} not found.`) + ); } next(); return capability.promise; diff --git a/src/core/core_utils.js b/src/core/core_utils.js index bea871789..0fd7f3014 100644 --- a/src/core/core_utils.js +++ b/src/core/core_utils.js @@ -60,6 +60,12 @@ class MissingDataException extends BaseException { } } +class PageDictMissingException extends BaseException { + constructor(msg) { + super(msg, "PageDictMissingException"); + } +} + class ParserEOFException extends BaseException { constructor(msg) { super(msg, "ParserEOFException"); @@ -541,6 +547,7 @@ export { isWhiteSpace, log2, MissingDataException, + PageDictMissingException, ParserEOFException, parseXFAPath, readInt8, diff --git a/src/core/document.js b/src/core/document.js index 3cee33389..bc33b1952 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -50,6 +50,7 @@ import { getInheritableProperty, isWhiteSpace, MissingDataException, + PageDictMissingException, validateCSSFont, XRefEntryException, XRefParseException, @@ -787,7 +788,9 @@ class PDFDocument { get numPages() { let num = 0; - if (this.xfaFactory) { + if (this.catalog.hasActualNumPages) { + num = this.catalog.numPages; + } else if (this.xfaFactory) { // num is a Promise. num = this.xfaFactory.getNumPages(); } else if (this.linearization) { @@ -1331,8 +1334,13 @@ class PDFDocument { return promise; } - checkFirstPage() { - return this.getPage(0).catch(async reason => { + async checkFirstPage(recoveryMode = false) { + if (recoveryMode) { + return; + } + try { + await this.getPage(0); + } catch (reason) { if (reason instanceof XRefEntryException) { // Clear out the various caches to ensure that we haven't stored any // inconsistent and/or incorrect state, since that could easily break @@ -1342,7 +1350,60 @@ class PDFDocument { throw new XRefParseException(); } - }); + } + } + + async checkLastPage(recoveryMode = false) { + this.catalog.setActualNumPages(); // Ensure that it's always reset. + let numPages; + + try { + await Promise.all([ + this.pdfManager.ensureDoc("xfaFactory"), + this.pdfManager.ensureDoc("linearization"), + this.pdfManager.ensureCatalog("numPages"), + ]); + + if (this.xfaFactory) { + return; // The Page count is always calculated for XFA-documents. + } else if (this.linearization) { + numPages = this.linearization.numPages; + } else { + numPages = this.catalog.numPages; + } + + if (numPages === 1) { + return; + } else if (!Number.isInteger(numPages)) { + throw new FormatError("Page count is not an integer."); + } + await this.getPage(numPages - 1); + } catch (reason) { + warn(`checkLastPage - invalid /Pages tree /Count: ${numPages}.`); + // Clear out the various caches to ensure that we haven't stored any + // inconsistent and/or incorrect state, since that could easily break + // subsequent `this.getPage` calls. + await this.cleanup(); + + let pageIndex = 1; // The first page was already loaded. + while (true) { + try { + await this.getPage(pageIndex, /* skipCount = */ true); + } catch (reasonLoop) { + if (reasonLoop instanceof PageDictMissingException) { + break; + } + if (reasonLoop instanceof XRefEntryException) { + if (!recoveryMode) { + throw new XRefParseException(); + } + break; + } + } + pageIndex++; + } + this.catalog.setActualNumPages(pageIndex); + } } fontFallback(id, handler) { diff --git a/src/core/worker.js b/src/core/worker.js index f6b41df99..24694c0a8 100644 --- a/src/core/worker.js +++ b/src/core/worker.js @@ -170,11 +170,12 @@ class WorkerMessageHandler { await pdfManager.ensureDoc("parseStartXRef"); await pdfManager.ensureDoc("parse", [recoveryMode]); - if (!recoveryMode) { - // Check that at least the first page can be successfully loaded, - // since otherwise the XRef table is definitely not valid. - await pdfManager.ensureDoc("checkFirstPage"); - } + // Check that at least the first page can be successfully loaded, + // since otherwise the XRef table is definitely not valid. + await pdfManager.ensureDoc("checkFirstPage", [recoveryMode]); + // Check that the last page can be sucessfully loaded, to ensure that + // `numPages` is correct, and fallback to walking the entire /Pages-tree. + await pdfManager.ensureDoc("checkLastPage", [recoveryMode]); const isPureXfa = await pdfManager.ensureDoc("isPureXfa"); if (isPureXfa) { diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 889b2a104..c4fcf9d30 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -490,3 +490,5 @@ !PDFBOX-4352-0.pdf !REDHAT-1531897-0.pdf !xfa_issue14315.pdf +!poppler-67295-0.pdf +!poppler-85140-0.pdf diff --git a/test/pdfs/poppler-67295-0.pdf b/test/pdfs/poppler-67295-0.pdf new file mode 100644 index 000000000..eb54bf85d --- /dev/null +++ b/test/pdfs/poppler-67295-0.pdf @@ -0,0 +1,40 @@ +%PDF-1.2 +1 0 obj +<> +endobj +2 0 obj +<> +endobj +3 0 obj +<> +stream +BT /F1 24 Tf 20 750 Td (TestString123) Tj ET +endstream +endobj +4 0 obj +[/PDF /Text] +endobj +5 0 obj +<> +endobj +6 0 obj +<> +endobj +7 0 obj +<>>>>> +endobj +xref +0 8 +0000000000 65535 f +0000000010 00000 n +0000000076 00000 n +0000000123 00000 n +0000000221 00000 n +0000000252 00000 n +0000000361 00000 n +0000000428 00000 n +trailer +<> +startxref +566 +%%EOF diff --git a/test/pdfs/poppler-85140-0.pdf b/test/pdfs/poppler-85140-0.pdf new file mode 100644 index 000000000..5ae8023b1 --- /dev/null +++ b/test/pdfs/poppler-85140-0.pdf @@ -0,0 +1,36 @@ +%PDF-1.4 +1 0 obj +<< + /Type /Catalog + /Pages 2 0 R +>> +endobj +2 0 obj +<< + /Type /Pages + /Kids [3 0 R] + /Count 213804087 +>> +endobj +3 18446744073709551616 obj +<< + /Type /Page + /Parent 2 0 R + /MediaBox [0 0 595 2147483647] + /Contents 4 0 R +>> +endobj +4 233245 obj +<< +>> +stream +endstream +endobj +5 0 obj +>> +endobj +trailer +<< + /Root 1 0 R +>> +%%EOF diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index f61d19a35..c8d98acf3 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -25,6 +25,7 @@ import { PasswordResponses, PermissionFlag, StreamType, + UnknownErrorException, } from "../../src/shared/util.js"; import { buildGetDocumentParams, @@ -478,6 +479,38 @@ describe("api", function () { await loadingTask.destroy(); }); + + it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () { + const loadingTask1 = getDocument( + buildGetDocumentParams("poppler-67295-0.pdf") + ); + const loadingTask2 = getDocument( + buildGetDocumentParams("poppler-85140-0.pdf") + ); + expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true); + expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true); + + const pdfDocument1 = await loadingTask1.promise; + const pdfDocument2 = await loadingTask2.promise; + + expect(pdfDocument1.numPages).toEqual(1); + expect(pdfDocument2.numPages).toEqual(1); + + const pageA = await pdfDocument1.getPage(1); + expect(pageA instanceof PDFPageProxy).toEqual(true); + + try { + await pdfDocument2.getPage(1); + + // Shouldn't get here. + expect(false).toEqual(true); + } catch (reason) { + expect(reason instanceof UnknownErrorException).toEqual(true); + expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R"); + } + + await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); + }); }); describe("PDFWorker", function () { @@ -683,7 +716,7 @@ describe("api", function () { throw new Error("shall fail for invalid page"); }, function (reason) { - expect(reason instanceof Error).toEqual(true); + expect(reason instanceof UnknownErrorException).toEqual(true); expect(reason.message).toEqual( "Pages tree contains circular reference." ); @@ -724,7 +757,10 @@ describe("api", function () { // Shouldn't get here. expect(false).toEqual(true); } catch (reason) { - expect(reason instanceof Error).toEqual(true); + expect(reason instanceof UnknownErrorException).toEqual(true); + expect(reason.message).toEqual( + "The reference does not point to a /Page dictionary." + ); } });