Convert Catalog.getAllPageDicts to an async method

The patch in PR 14335 *essentially* re-introduced the old code from before PR 3848; however, looking at this code a bit more closely, it should be possible to simplify it by making the method asynchronous.

While this method is currently only used as a *fallback* in corrupt documents, the way that `MissingDataException`s are handled is less than ideal. Note that if a `MissingDataException` is thrown, we're forced to re-parse the *entire* /Pages tree[1].
With this method now being asynchronous, we're able to handle fetching of References in a *much* easier/nicer way than before without having to throw `MissingDataException`s and re-parse anything.
These changes also let us simplify the call-site slightly, by calling the method *directly* instead of using the `PDFManager`-instance (since again it will no longer throw `MissingDataException`s).

Furthermore, this patch contains the following other changes:
 - Reduce unnecessary duplication in the various `catch` handlers throughout the method, by simply moving the `XRefEntryException` handling into the `addPageError` helper function instead.
 - Move the "circular references"-check to occur slightly earlier, since there's obviously no point in asynchronously fetching data just to then throw an Error *immediately* afterwards.

---
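[1] Imagine e.g. a thousand-page document, where a `MissingDataException` is thrown when fetching/parsing page 900; the *entire* /Pages tree, including all of the pages that were already parsed, would then have to be parsed again.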
This commit is contained in:
Jonas Jenwald 2021-12-31 14:57:01 +01:00
parent 3d7bb6c38d
commit b0e774d9c5
2 changed files with 29 additions and 39 deletions

View File

@ -1210,9 +1210,9 @@ class Catalog {
/**
* Eagerly fetches the entire /Pages-tree; should ONLY be used as a fallback.
* @returns {Map}
* @returns {Promise<Map>}
*/
getAllPageDicts(recoveryMode = false) {
async getAllPageDicts(recoveryMode = false) {
const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }];
const visitedNodes = new RefSet();
@ -1221,6 +1221,7 @@ class Catalog {
visitedNodes.put(pagesRef);
}
const map = new Map(),
xref = this.xref,
pageIndexCache = this.pageIndexCache;
let pageIndex = 0;
@ -1233,6 +1234,10 @@ class Catalog {
map.set(pageIndex++, [pageDict, pageRef]);
}
// Records a failure for the current page: stores `error`, paired with a
// `null` second element, in `map` under the next `pageIndex` (which is then
// incremented).
// An `XRefEntryException` is re-thrown instead of being recorded, unless
// `recoveryMode` is set; this centralizes the check so that the individual
// `catch` handlers don't have to repeat it.
function addPageError(error) {
if (error instanceof XRefEntryException && !recoveryMode) {
throw error;
}
map.set(pageIndex++, [error, null]);
}
@ -1240,18 +1245,14 @@ class Catalog {
const queueItem = queue[queue.length - 1];
const { currentNode, posInKids } = queueItem;
let kids;
try {
kids = currentNode.get("Kids");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
let kids = currentNode.getRaw("Kids");
if (kids instanceof Ref) {
try {
kids = await xref.fetchAsync(kids);
} catch (ex) {
addPageError(ex);
break;
}
if (ex instanceof XRefEntryException && !recoveryMode) {
throw ex;
}
addPageError(ex);
break;
}
if (!Array.isArray(kids)) {
addPageError(
@ -1268,18 +1269,6 @@ class Catalog {
const kidObj = kids[posInKids];
let obj;
if (kidObj instanceof Ref) {
try {
obj = this.xref.fetch(kidObj);
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
if (ex instanceof XRefEntryException && !recoveryMode) {
throw ex;
}
addPageError(ex);
break;
}
// Prevent circular references in the /Pages tree.
if (visitedNodes.has(kidObj)) {
addPageError(
@ -1288,6 +1277,13 @@ class Catalog {
break;
}
visitedNodes.put(kidObj);
try {
obj = await xref.fetchAsync(kidObj);
} catch (ex) {
addPageError(ex);
break;
}
} else {
// Prevent errors in corrupt PDF documents that violate the
// specification by *inlining* Page dicts directly in the Kids
@ -1303,18 +1299,14 @@ class Catalog {
break;
}
let type;
try {
type = obj.get("Type");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
let type = obj.getRaw("Type");
if (type instanceof Ref) {
try {
type = await xref.fetchAsync(type);
} catch (ex) {
addPageError(ex);
break;
}
if (ex instanceof XRefEntryException && !recoveryMode) {
throw ex;
}
addPageError(ex);
break;
}
if (isName(type, "Page") || !obj.has("Kids")) {
addPageDict(obj, kidObj instanceof Ref ? kidObj : null);

View File

@ -1401,9 +1401,7 @@ class PDFDocument {
let pagesTree;
try {
pagesTree = await pdfManager.ensureCatalog("getAllPageDicts", [
recoveryMode,
]);
pagesTree = await catalog.getAllPageDicts(recoveryMode);
} catch (reasonAll) {
if (reasonAll instanceof XRefEntryException && !recoveryMode) {
throw new XRefParseException();