From 47f9eef584390b1a5e7a5e923fbfe425e9560e35 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 10 Dec 2021 11:45:09 +0100 Subject: [PATCH 1/2] Improve `PDFDocument.checkLastPage` for documents with corrupt XRef tables (PR 14311, 14335 follow-up) Rather than trying, and failing, to fetch the entire /Pages-tree for documents with corrupt XRef tables, let's fall back to indexing all objects *before* trying to invoke the `Catalog.getAllPageDicts` method. --- src/core/document.js | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/core/document.js b/src/core/document.js index 0706eb888..5d7768a52 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -1344,7 +1344,7 @@ class PDFDocument { // Clear out the various caches to ensure that we haven't stored any // inconsistent and/or incorrect state, since that could easily break // subsequent `this.getPage` calls. - this._pagePromises.clear(); + this._pagePromises.delete(0); await this.cleanup(); throw new XRefParseException(); @@ -1380,20 +1380,23 @@ class PDFDocument { } await this.getPage(numPages - 1); } catch (reason) { - warn(`checkLastPage - invalid /Pages tree /Count: ${numPages}.`); // Clear out the various caches to ensure that we haven't stored any // inconsistent and/or incorrect state, since that could easily break // subsequent `this.getPage` calls. 
+ this._pagePromises.delete(numPages - 1); await this.cleanup(); + if (reason instanceof XRefEntryException && !recoveryMode) { + throw new XRefParseException(); + } + warn(`checkLastPage - invalid /Pages tree /Count: ${numPages}.`); + let pagesTree; try { pagesTree = await pdfManager.ensureCatalog("getAllPageDicts"); } catch (reasonAll) { - if (reasonAll instanceof XRefEntryException) { - if (!recoveryMode) { - throw new XRefParseException(); - } + if (reasonAll instanceof XRefEntryException && !recoveryMode) { + throw new XRefParseException(); } catalog.setActualNumPages(1); return; From 70ac6b1694db8f8596dd3b22c97b6412f64955ad Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 10 Dec 2021 13:54:34 +0100 Subject: [PATCH 2/2] Update `Catalog.getAllPageDicts` to always propagate the actual Errors (PR 14335 follow-up) Rather than "swallowing" the actual Errors, when data fetching fails, ensure that they're always being propagated as intended to the call-site instead. Note that we purposely handle `XRefEntryException` specially, to make it possible to fall back to indexing all XRef objects. --- src/core/catalog.js | 26 ++++++++++++++++++-------- src/core/document.js | 4 +++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index 5a6296557..ec885c5ce 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -1224,7 +1224,7 @@ class Catalog { * Eagerly fetches the entire /Pages-tree; should ONLY be used as a fallback. 
* @returns {Map} */ - getAllPageDicts() { + getAllPageDicts(recoveryMode = false) { const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }]; const visitedNodes = new RefSet(); const map = new Map(); @@ -1233,8 +1233,8 @@ class Catalog { function addPageDict(pageDict, pageRef) { map.set(pageIndex++, [pageDict, pageRef]); } - function addPageError(msg) { - map.set(pageIndex++, [new FormatError(msg), null]); + function addPageError(error) { + map.set(pageIndex++, [error, null]); } while (queue.length > 0) { @@ -1248,12 +1248,16 @@ class Catalog { if (ex instanceof MissingDataException) { throw ex; } - if (ex instanceof XRefEntryException) { + if (ex instanceof XRefEntryException && !recoveryMode) { throw ex; } + addPageError(ex); + break; } if (!Array.isArray(kids)) { - addPageError("Page dictionary kids object is not an array."); + addPageError( + new FormatError("Page dictionary kids object is not an array.") + ); break; } @@ -1271,13 +1275,17 @@ class Catalog { if (ex instanceof MissingDataException) { throw ex; } - if (ex instanceof XRefEntryException) { + if (ex instanceof XRefEntryException && !recoveryMode) { throw ex; } + addPageError(ex); + break; } // Prevent circular references in the /Pages tree. if (visitedNodes.has(kidObj)) { - addPageError("Pages tree contains circular reference."); + addPageError( + new FormatError("Pages tree contains circular reference.") + ); break; } visitedNodes.put(kidObj); @@ -1289,7 +1297,9 @@ class Catalog { } if (!(obj instanceof Dict)) { addPageError( - "Page dictionary kid reference points to wrong type of object." + new FormatError( + "Page dictionary kid reference points to wrong type of object." 
+ ) ); break; } diff --git a/src/core/document.js b/src/core/document.js index 5d7768a52..d2238cae9 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -1393,7 +1393,9 @@ class PDFDocument { let pagesTree; try { - pagesTree = await pdfManager.ensureCatalog("getAllPageDicts"); + pagesTree = await pdfManager.ensureCatalog("getAllPageDicts", [ + recoveryMode, + ]); } catch (reasonAll) { if (reasonAll instanceof XRefEntryException && !recoveryMode) { throw new XRefParseException();