[Regression] Eagerly fetch/parse the entire /Pages-tree in corrupt documents (issue 14303, PR 14311 follow-up)

*Please note:* This is similar to the method that existed prior to PR 3848, but the new method will *only* be used as a fallback when parsing of corrupt PDF documents.

The implementation in PR 14311 unfortunately turned out to be *way* too simplistic, as evidenced by the recently added test-files in issue 14303, since it may *cause* infinite loops in `PDFDocument.checkLastPage` for some corrupt PDF documents.[1]
To avoid this, the easiest solution that I could come up with was to fall back to eagerly parsing the *entire* /Pages-tree when the /Count-entry validation fails during document initialization.

Fixes *at least* two of the issues listed in issue 14303, namely the `poppler-395-0.pdf...` and `GHOSTSCRIPT-698804-1.pdf...` documents.

---
[1] The whole point of PR 14311 was obviously to *get rid of* infinite loops during document initialization, not to introduce any more of those.
This commit is contained in:
Jonas Jenwald 2021-12-02 01:40:52 +01:00
parent f61b74e38e
commit 1fac6371d3
7 changed files with 504 additions and 35 deletions

View File

@ -22,15 +22,16 @@ import {
isRefsEqual, isRefsEqual,
isStream, isStream,
Name, Name,
Ref,
RefSet, RefSet,
RefSetCache, RefSetCache,
} from "./primitives.js"; } from "./primitives.js";
import { import {
collectActions, collectActions,
MissingDataException, MissingDataException,
PageDictMissingException,
recoverJsURL, recoverJsURL,
toRomanNumerals, toRomanNumerals,
XRefEntryException,
} from "./core_utils.js"; } from "./core_utils.js";
import { import {
createPromiseCapability, createPromiseCapability,
@ -1212,14 +1213,96 @@ class Catalog {
nodesToVisit.push(kids[last]); nodesToVisit.push(kids[last]);
} }
} }
capability.reject( capability.reject(new Error(`Page index ${pageIndex} not found.`));
new PageDictMissingException(`Page index ${pageIndex} not found.`)
);
} }
next(); next();
return capability.promise; return capability.promise;
} }
/**
* Eagerly fetches the entire /Pages-tree; should ONLY be used as a fallback.
* @returns {Map}
*/
getAllPageDicts() {
const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }];
const visitedNodes = new RefSet();
const map = new Map();
let pageIndex = 0;
function addPageDict(pageDict, pageRef) {
map.set(pageIndex++, [pageDict, pageRef]);
}
function addPageError(msg) {
map.set(pageIndex++, [new FormatError(msg), null]);
}
while (queue.length > 0) {
const queueItem = queue[queue.length - 1];
const { currentNode, posInKids } = queueItem;
let kids;
try {
kids = currentNode.get("Kids");
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
if (ex instanceof XRefEntryException) {
throw ex;
}
}
if (!Array.isArray(kids)) {
addPageError("Page dictionary kids object is not an array.");
break;
}
if (posInKids >= kids.length) {
queue.pop();
continue;
}
const kidObj = kids[posInKids];
let obj;
if (kidObj instanceof Ref) {
try {
obj = this.xref.fetch(kidObj);
} catch (ex) {
if (ex instanceof MissingDataException) {
throw ex;
}
if (ex instanceof XRefEntryException) {
throw ex;
}
}
// Prevent circular references in the /Pages tree.
if (visitedNodes.has(kidObj)) {
addPageError("Pages tree contains circular reference.");
break;
}
visitedNodes.put(kidObj);
} else {
// Prevent errors in corrupt PDF documents that violate the
// specification by *inlining* Page dicts directly in the Kids
// array, rather than using indirect objects (see issue9540.pdf).
obj = kidObj;
}
if (!(obj instanceof Dict)) {
addPageError(
"Page dictionary kid reference points to wrong type of object."
);
break;
}
if (isDict(obj, "Page") || !obj.has("Kids")) {
addPageDict(obj, kidObj instanceof Ref ? kidObj : null);
} else {
queue.push({ currentNode: obj, posInKids: 0 });
}
queueItem.posInKids++;
}
return map;
}
getPageIndex(pageRef) { getPageIndex(pageRef) {
const cachedPageIndex = this.pageIndexCache.get(pageRef); const cachedPageIndex = this.pageIndexCache.get(pageRef);
if (cachedPageIndex !== undefined) { if (cachedPageIndex !== undefined) {

View File

@ -60,12 +60,6 @@ class MissingDataException extends BaseException {
} }
} }
class PageDictMissingException extends BaseException {
constructor(msg) {
super(msg, "PageDictMissingException");
}
}
class ParserEOFException extends BaseException { class ParserEOFException extends BaseException {
constructor(msg) { constructor(msg) {
super(msg, "ParserEOFException"); super(msg, "ParserEOFException");
@ -547,7 +541,6 @@ export {
isWhiteSpace, isWhiteSpace,
log2, log2,
MissingDataException, MissingDataException,
PageDictMissingException,
ParserEOFException, ParserEOFException,
parseXFAPath, parseXFAPath,
readInt8, readInt8,

View File

@ -50,7 +50,6 @@ import {
getInheritableProperty, getInheritableProperty,
isWhiteSpace, isWhiteSpace,
MissingDataException, MissingDataException,
PageDictMissingException,
validateCSSFont, validateCSSFont,
XRefEntryException, XRefEntryException,
XRefParseException, XRefParseException,
@ -1354,14 +1353,16 @@ class PDFDocument {
} }
async checkLastPage(recoveryMode = false) { async checkLastPage(recoveryMode = false) {
this.catalog.setActualNumPages(); // Ensure that it's always reset. const { catalog, pdfManager } = this;
catalog.setActualNumPages(); // Ensure that it's always reset.
let numPages; let numPages;
try { try {
await Promise.all([ await Promise.all([
this.pdfManager.ensureDoc("xfaFactory"), pdfManager.ensureDoc("xfaFactory"),
this.pdfManager.ensureDoc("linearization"), pdfManager.ensureDoc("linearization"),
this.pdfManager.ensureCatalog("numPages"), pdfManager.ensureCatalog("numPages"),
]); ]);
if (this.xfaFactory) { if (this.xfaFactory) {
@ -1369,13 +1370,13 @@ class PDFDocument {
} else if (this.linearization) { } else if (this.linearization) {
numPages = this.linearization.numPages; numPages = this.linearization.numPages;
} else { } else {
numPages = this.catalog.numPages; numPages = catalog.numPages;
} }
if (numPages === 1) { if (!Number.isInteger(numPages)) {
return;
} else if (!Number.isInteger(numPages)) {
throw new FormatError("Page count is not an integer."); throw new FormatError("Page count is not an integer.");
} else if (numPages <= 1) {
return;
} }
await this.getPage(numPages - 1); await this.getPage(numPages - 1);
} catch (reason) { } catch (reason) {
@ -1385,24 +1386,48 @@ class PDFDocument {
// subsequent `this.getPage` calls. // subsequent `this.getPage` calls.
await this.cleanup(); await this.cleanup();
let pageIndex = 1; // The first page was already loaded. let pagesTree;
while (true) { try {
try { pagesTree = await pdfManager.ensureCatalog("getAllPageDicts");
await this.getPage(pageIndex); } catch (reasonAll) {
} catch (reasonLoop) { if (reasonAll instanceof XRefEntryException) {
if (reasonLoop instanceof PageDictMissingException) { if (!recoveryMode) {
break; throw new XRefParseException();
}
if (reasonLoop instanceof XRefEntryException) {
if (!recoveryMode) {
throw new XRefParseException();
}
break;
} }
} }
pageIndex++; catalog.setActualNumPages(1);
return;
} }
this.catalog.setActualNumPages(pageIndex);
for (const [pageIndex, [pageDict, ref]] of pagesTree) {
let promise;
if (pageDict instanceof Error) {
promise = Promise.reject(pageDict);
// Prevent "uncaught exception: Object"-messages in the console.
promise.catch(() => {});
} else {
promise = Promise.resolve(
new Page({
pdfManager,
xref: this.xref,
pageIndex,
pageDict,
ref,
globalIdFactory: this._globalIdFactory,
fontCache: catalog.fontCache,
builtInCMapCache: catalog.builtInCMapCache,
standardFontDataCache: catalog.standardFontDataCache,
globalImageCache: catalog.globalImageCache,
nonBlendModesSet: catalog.nonBlendModesSet,
xfaFactory: null,
})
);
}
this._pagePromises.set(pageIndex, promise);
}
catalog.setActualNumPages(pagesTree.size);
} }
} }

View File

@ -492,6 +492,8 @@
!xfa_issue14315.pdf !xfa_issue14315.pdf
!poppler-67295-0.pdf !poppler-67295-0.pdf
!poppler-85140-0.pdf !poppler-85140-0.pdf
!poppler-395-0-fuzzed.pdf
!GHOSTSCRIPT-698804-1-fuzzed.pdf
!poppler-91414-0-53.pdf !poppler-91414-0-53.pdf
!poppler-91414-0-54.pdf !poppler-91414-0-54.pdf
!poppler-742-0-fuzzed.pdf !poppler-742-0-fuzzed.pdf

View File

@ -0,0 +1,69 @@
%PDF-1.4
%âãÏÓ
1 0 obj
<<
/Type /Catalog
/Outline 2 0 R
/Pages 3 0 R
>>
endobj
2 0 obj
<<
/Type /Outlines
/Count 0
>>
endobj
3 0 obj
<<
/Type /Pages
/Kids [ 4 0 R ]
/Count 1
>>
endobj
4 0 obj
<<
/Type /Page
/Parent 3 0 R
/MediaBox [ 0 0 612 792 ]
/Contents 5 0 R
/Resources <<
/ProcSet 6 0 R
>>
>>
endobj
5 0 obj
<<
/Length 0
>>
stream
endstream
endobj
6 0 obj
[ /PDF ]
endobj
xref
0 2
0000000000 65536 f
0000000016 00000 n
00000004294967296 3
0000000138 00000 n
0000000204 00000 n
0000000342 00000 n
trailer
<<
/Size 7
/Root 1 0 R
>>
startxref
418
%%EOF

File diff suppressed because one or more lines are too long

View File

@ -495,14 +495,27 @@ describe("api", function () {
const loadingTask2 = getDocument( const loadingTask2 = getDocument(
buildGetDocumentParams("poppler-85140-0.pdf") buildGetDocumentParams("poppler-85140-0.pdf")
); );
const loadingTask3 = getDocument(
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
);
const loadingTask4 = getDocument(
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
);
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true); expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true); expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);
const pdfDocument1 = await loadingTask1.promise; const pdfDocument1 = await loadingTask1.promise;
const pdfDocument2 = await loadingTask2.promise; const pdfDocument2 = await loadingTask2.promise;
const pdfDocument3 = await loadingTask3.promise;
const pdfDocument4 = await loadingTask4.promise;
expect(pdfDocument1.numPages).toEqual(1); expect(pdfDocument1.numPages).toEqual(1);
expect(pdfDocument2.numPages).toEqual(1); expect(pdfDocument2.numPages).toEqual(1);
expect(pdfDocument3.numPages).toEqual(1);
expect(pdfDocument4.numPages).toEqual(1);
const pageA = await pdfDocument1.getPage(1); const pageA = await pdfDocument1.getPage(1);
expect(pageA instanceof PDFPageProxy).toEqual(true); expect(pageA instanceof PDFPageProxy).toEqual(true);
@ -516,6 +529,28 @@ describe("api", function () {
expect(reason instanceof UnknownErrorException).toEqual(true); expect(reason instanceof UnknownErrorException).toEqual(true);
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R"); expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
} }
try {
await pdfDocument3.getPage(1);
// Shouldn't get here.
expect(false).toEqual(true);
} catch (reason) {
expect(reason instanceof UnknownErrorException).toEqual(true);
expect(reason.message).toEqual(
"Page dictionary kid reference points to wrong type of object."
);
}
try {
await pdfDocument4.getPage(1);
// Shouldn't get here.
expect(false).toEqual(true);
} catch (reason) {
expect(reason instanceof UnknownErrorException).toEqual(true);
expect(reason.message).toEqual(
"Page dictionary kid reference points to wrong type of object."
);
}
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
}); });