From b0e774d9c5c7609c44c2df9bfb0b40075c0d5bc9 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Fri, 31 Dec 2021 14:57:01 +0100 Subject: [PATCH] Convert `Catalog.getAllPageDicts` to an `async` method The patch in PR 14335 *essentially* re-introduced the old code from before PR 3848; however, looking at this code a bit closer, it should be possible to simplify it by making the method asynchronous. While this method is currently only used as a *fallback* in corrupt documents, the way that `MissingDataException`s are handled is less than ideal. Note that if a `MissingDataException` is thrown, we're forced to re-parse the *entire* /Pages tree[1]. With this method now being asynchronous, we're able to handle fetching of References in a *much* easier/nicer way than before, without having to throw `MissingDataException`s or re-parse anything. These changes also let us simplify the call-site slightly, by calling the method *directly* instead of using the `PDFManager`-instance (since again it will no longer throw `MissingDataException`s). Furthermore, this patch contains the following other changes: - Reduce unnecessary duplication in the various `catch` handlers throughout the method, by simply moving the `XRefEntryException` handling into the `addPageError` helper function instead. - Move the "circular references"-check to occur slightly earlier, since there's obviously no point in asynchronously fetching data just to then throw an Error *immediately* afterwards. --- [1] Imagine e.g. a thousand-page document, where there's a `MissingDataException` thrown when fetching/parsing page 900. In that case the *entire* /Pages tree, including everything that was already parsed, has to be parsed again from the start. 
--- src/core/catalog.js | 64 +++++++++++++++++++------------------------- src/core/document.js | 4 +-- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/src/core/catalog.js b/src/core/catalog.js index 30ccddf16..044979a9f 100644 --- a/src/core/catalog.js +++ b/src/core/catalog.js @@ -1210,9 +1210,9 @@ class Catalog { /** * Eagerly fetches the entire /Pages-tree; should ONLY be used as a fallback. - * @returns {Map} + * @returns {Promise} */ - getAllPageDicts(recoveryMode = false) { + async getAllPageDicts(recoveryMode = false) { const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }]; const visitedNodes = new RefSet(); @@ -1221,6 +1221,7 @@ class Catalog { visitedNodes.put(pagesRef); } const map = new Map(), + xref = this.xref, pageIndexCache = this.pageIndexCache; let pageIndex = 0; @@ -1233,6 +1234,10 @@ class Catalog { map.set(pageIndex++, [pageDict, pageRef]); } function addPageError(error) { + if (error instanceof XRefEntryException && !recoveryMode) { + throw error; + } + map.set(pageIndex++, [error, null]); } @@ -1240,18 +1245,14 @@ class Catalog { const queueItem = queue[queue.length - 1]; const { currentNode, posInKids } = queueItem; - let kids; - try { - kids = currentNode.get("Kids"); - } catch (ex) { - if (ex instanceof MissingDataException) { - throw ex; + let kids = currentNode.getRaw("Kids"); + if (kids instanceof Ref) { + try { + kids = await xref.fetchAsync(kids); + } catch (ex) { + addPageError(ex); + break; } - if (ex instanceof XRefEntryException && !recoveryMode) { - throw ex; - } - addPageError(ex); - break; } if (!Array.isArray(kids)) { addPageError( @@ -1268,18 +1269,6 @@ class Catalog { const kidObj = kids[posInKids]; let obj; if (kidObj instanceof Ref) { - try { - obj = this.xref.fetch(kidObj); - } catch (ex) { - if (ex instanceof MissingDataException) { - throw ex; - } - if (ex instanceof XRefEntryException && !recoveryMode) { - throw ex; - } - addPageError(ex); - break; - } // Prevent circular references in 
the /Pages tree. if (visitedNodes.has(kidObj)) { addPageError( @@ -1288,6 +1277,13 @@ class Catalog { break; } visitedNodes.put(kidObj); + + try { + obj = await xref.fetchAsync(kidObj); + } catch (ex) { + addPageError(ex); + break; + } } else { // Prevent errors in corrupt PDF documents that violate the // specification by *inlining* Page dicts directly in the Kids @@ -1303,18 +1299,14 @@ class Catalog { break; } - let type; - try { - type = obj.get("Type"); - } catch (ex) { - if (ex instanceof MissingDataException) { - throw ex; + let type = obj.getRaw("Type"); + if (type instanceof Ref) { + try { + type = await xref.fetchAsync(type); + } catch (ex) { + addPageError(ex); + break; } - if (ex instanceof XRefEntryException && !recoveryMode) { - throw ex; - } - addPageError(ex); - break; } if (isName(type, "Page") || !obj.has("Kids")) { addPageDict(obj, kidObj instanceof Ref ? kidObj : null); diff --git a/src/core/document.js b/src/core/document.js index 06b3f3895..8d113abf8 100644 --- a/src/core/document.js +++ b/src/core/document.js @@ -1401,9 +1401,7 @@ class PDFDocument { let pagesTree; try { - pagesTree = await pdfManager.ensureCatalog("getAllPageDicts", [ - recoveryMode, - ]); + pagesTree = await catalog.getAllPageDicts(recoveryMode); } catch (reasonAll) { if (reasonAll instanceof XRefEntryException && !recoveryMode) { throw new XRefParseException();