[Regression] Eagerly fetch/parse the entire /Pages-tree in corrupt documents (issue 14303, PR 14311 follow-up)
*Please note:* This is similar to the method that existed prior to PR 3848, but the new method will *only* be used as a fallback when parsing corrupt PDF documents. The implementation in PR 14311 unfortunately turned out to be *way* too simplistic, as evidenced by the recently added test-files in issue 14303, since it may *cause* infinite loops in `PDFDocument.checkLastPage` for some corrupt PDF documents.[1] To avoid this, the easiest solution that I could come up with was to fall back to eagerly parsing the *entire* /Pages-tree when the /Count-entry validation fails during document initialization. Fixes *at least* two of the issues listed in issue 14303, namely the `poppler-395-0.pdf...` and `GHOSTSCRIPT-698804-1.pdf...` documents. --- [1] The whole point of PR 14311 was obviously to *get rid of* infinite loops during document initialization, not to introduce any more of those.
This commit is contained in:
parent
f61b74e38e
commit
1fac6371d3
@ -22,15 +22,16 @@ import {
|
|||||||
isRefsEqual,
|
isRefsEqual,
|
||||||
isStream,
|
isStream,
|
||||||
Name,
|
Name,
|
||||||
|
Ref,
|
||||||
RefSet,
|
RefSet,
|
||||||
RefSetCache,
|
RefSetCache,
|
||||||
} from "./primitives.js";
|
} from "./primitives.js";
|
||||||
import {
|
import {
|
||||||
collectActions,
|
collectActions,
|
||||||
MissingDataException,
|
MissingDataException,
|
||||||
PageDictMissingException,
|
|
||||||
recoverJsURL,
|
recoverJsURL,
|
||||||
toRomanNumerals,
|
toRomanNumerals,
|
||||||
|
XRefEntryException,
|
||||||
} from "./core_utils.js";
|
} from "./core_utils.js";
|
||||||
import {
|
import {
|
||||||
createPromiseCapability,
|
createPromiseCapability,
|
||||||
@ -1212,14 +1213,96 @@ class Catalog {
|
|||||||
nodesToVisit.push(kids[last]);
|
nodesToVisit.push(kids[last]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
capability.reject(
|
capability.reject(new Error(`Page index ${pageIndex} not found.`));
|
||||||
new PageDictMissingException(`Page index ${pageIndex} not found.`)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
next();
|
next();
|
||||||
return capability.promise;
|
return capability.promise;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Eagerly fetches the entire /Pages-tree; should ONLY be used as a fallback.
|
||||||
|
* @returns {Map}
|
||||||
|
*/
|
||||||
|
getAllPageDicts() {
|
||||||
|
const queue = [{ currentNode: this.toplevelPagesDict, posInKids: 0 }];
|
||||||
|
const visitedNodes = new RefSet();
|
||||||
|
const map = new Map();
|
||||||
|
let pageIndex = 0;
|
||||||
|
|
||||||
|
function addPageDict(pageDict, pageRef) {
|
||||||
|
map.set(pageIndex++, [pageDict, pageRef]);
|
||||||
|
}
|
||||||
|
function addPageError(msg) {
|
||||||
|
map.set(pageIndex++, [new FormatError(msg), null]);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (queue.length > 0) {
|
||||||
|
const queueItem = queue[queue.length - 1];
|
||||||
|
const { currentNode, posInKids } = queueItem;
|
||||||
|
|
||||||
|
let kids;
|
||||||
|
try {
|
||||||
|
kids = currentNode.get("Kids");
|
||||||
|
} catch (ex) {
|
||||||
|
if (ex instanceof MissingDataException) {
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
if (ex instanceof XRefEntryException) {
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!Array.isArray(kids)) {
|
||||||
|
addPageError("Page dictionary kids object is not an array.");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (posInKids >= kids.length) {
|
||||||
|
queue.pop();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
const kidObj = kids[posInKids];
|
||||||
|
let obj;
|
||||||
|
if (kidObj instanceof Ref) {
|
||||||
|
try {
|
||||||
|
obj = this.xref.fetch(kidObj);
|
||||||
|
} catch (ex) {
|
||||||
|
if (ex instanceof MissingDataException) {
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
if (ex instanceof XRefEntryException) {
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Prevent circular references in the /Pages tree.
|
||||||
|
if (visitedNodes.has(kidObj)) {
|
||||||
|
addPageError("Pages tree contains circular reference.");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
visitedNodes.put(kidObj);
|
||||||
|
} else {
|
||||||
|
// Prevent errors in corrupt PDF documents that violate the
|
||||||
|
// specification by *inlining* Page dicts directly in the Kids
|
||||||
|
// array, rather than using indirect objects (see issue9540.pdf).
|
||||||
|
obj = kidObj;
|
||||||
|
}
|
||||||
|
if (!(obj instanceof Dict)) {
|
||||||
|
addPageError(
|
||||||
|
"Page dictionary kid reference points to wrong type of object."
|
||||||
|
);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isDict(obj, "Page") || !obj.has("Kids")) {
|
||||||
|
addPageDict(obj, kidObj instanceof Ref ? kidObj : null);
|
||||||
|
} else {
|
||||||
|
queue.push({ currentNode: obj, posInKids: 0 });
|
||||||
|
}
|
||||||
|
queueItem.posInKids++;
|
||||||
|
}
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
getPageIndex(pageRef) {
|
getPageIndex(pageRef) {
|
||||||
const cachedPageIndex = this.pageIndexCache.get(pageRef);
|
const cachedPageIndex = this.pageIndexCache.get(pageRef);
|
||||||
if (cachedPageIndex !== undefined) {
|
if (cachedPageIndex !== undefined) {
|
||||||
|
@ -60,12 +60,6 @@ class MissingDataException extends BaseException {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class PageDictMissingException extends BaseException {
|
|
||||||
constructor(msg) {
|
|
||||||
super(msg, "PageDictMissingException");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class ParserEOFException extends BaseException {
|
class ParserEOFException extends BaseException {
|
||||||
constructor(msg) {
|
constructor(msg) {
|
||||||
super(msg, "ParserEOFException");
|
super(msg, "ParserEOFException");
|
||||||
@ -547,7 +541,6 @@ export {
|
|||||||
isWhiteSpace,
|
isWhiteSpace,
|
||||||
log2,
|
log2,
|
||||||
MissingDataException,
|
MissingDataException,
|
||||||
PageDictMissingException,
|
|
||||||
ParserEOFException,
|
ParserEOFException,
|
||||||
parseXFAPath,
|
parseXFAPath,
|
||||||
readInt8,
|
readInt8,
|
||||||
|
@ -50,7 +50,6 @@ import {
|
|||||||
getInheritableProperty,
|
getInheritableProperty,
|
||||||
isWhiteSpace,
|
isWhiteSpace,
|
||||||
MissingDataException,
|
MissingDataException,
|
||||||
PageDictMissingException,
|
|
||||||
validateCSSFont,
|
validateCSSFont,
|
||||||
XRefEntryException,
|
XRefEntryException,
|
||||||
XRefParseException,
|
XRefParseException,
|
||||||
@ -1354,14 +1353,16 @@ class PDFDocument {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async checkLastPage(recoveryMode = false) {
|
async checkLastPage(recoveryMode = false) {
|
||||||
this.catalog.setActualNumPages(); // Ensure that it's always reset.
|
const { catalog, pdfManager } = this;
|
||||||
|
|
||||||
|
catalog.setActualNumPages(); // Ensure that it's always reset.
|
||||||
let numPages;
|
let numPages;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await Promise.all([
|
await Promise.all([
|
||||||
this.pdfManager.ensureDoc("xfaFactory"),
|
pdfManager.ensureDoc("xfaFactory"),
|
||||||
this.pdfManager.ensureDoc("linearization"),
|
pdfManager.ensureDoc("linearization"),
|
||||||
this.pdfManager.ensureCatalog("numPages"),
|
pdfManager.ensureCatalog("numPages"),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
if (this.xfaFactory) {
|
if (this.xfaFactory) {
|
||||||
@ -1369,13 +1370,13 @@ class PDFDocument {
|
|||||||
} else if (this.linearization) {
|
} else if (this.linearization) {
|
||||||
numPages = this.linearization.numPages;
|
numPages = this.linearization.numPages;
|
||||||
} else {
|
} else {
|
||||||
numPages = this.catalog.numPages;
|
numPages = catalog.numPages;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (numPages === 1) {
|
if (!Number.isInteger(numPages)) {
|
||||||
return;
|
|
||||||
} else if (!Number.isInteger(numPages)) {
|
|
||||||
throw new FormatError("Page count is not an integer.");
|
throw new FormatError("Page count is not an integer.");
|
||||||
|
} else if (numPages <= 1) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
await this.getPage(numPages - 1);
|
await this.getPage(numPages - 1);
|
||||||
} catch (reason) {
|
} catch (reason) {
|
||||||
@ -1385,24 +1386,48 @@ class PDFDocument {
|
|||||||
// subsequent `this.getPage` calls.
|
// subsequent `this.getPage` calls.
|
||||||
await this.cleanup();
|
await this.cleanup();
|
||||||
|
|
||||||
let pageIndex = 1; // The first page was already loaded.
|
let pagesTree;
|
||||||
while (true) {
|
try {
|
||||||
try {
|
pagesTree = await pdfManager.ensureCatalog("getAllPageDicts");
|
||||||
await this.getPage(pageIndex);
|
} catch (reasonAll) {
|
||||||
} catch (reasonLoop) {
|
if (reasonAll instanceof XRefEntryException) {
|
||||||
if (reasonLoop instanceof PageDictMissingException) {
|
if (!recoveryMode) {
|
||||||
break;
|
throw new XRefParseException();
|
||||||
}
|
|
||||||
if (reasonLoop instanceof XRefEntryException) {
|
|
||||||
if (!recoveryMode) {
|
|
||||||
throw new XRefParseException();
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pageIndex++;
|
catalog.setActualNumPages(1);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
this.catalog.setActualNumPages(pageIndex);
|
|
||||||
|
for (const [pageIndex, [pageDict, ref]] of pagesTree) {
|
||||||
|
let promise;
|
||||||
|
if (pageDict instanceof Error) {
|
||||||
|
promise = Promise.reject(pageDict);
|
||||||
|
|
||||||
|
// Prevent "uncaught exception: Object"-messages in the console.
|
||||||
|
promise.catch(() => {});
|
||||||
|
} else {
|
||||||
|
promise = Promise.resolve(
|
||||||
|
new Page({
|
||||||
|
pdfManager,
|
||||||
|
xref: this.xref,
|
||||||
|
pageIndex,
|
||||||
|
pageDict,
|
||||||
|
ref,
|
||||||
|
globalIdFactory: this._globalIdFactory,
|
||||||
|
fontCache: catalog.fontCache,
|
||||||
|
builtInCMapCache: catalog.builtInCMapCache,
|
||||||
|
standardFontDataCache: catalog.standardFontDataCache,
|
||||||
|
globalImageCache: catalog.globalImageCache,
|
||||||
|
nonBlendModesSet: catalog.nonBlendModesSet,
|
||||||
|
xfaFactory: null,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
this._pagePromises.set(pageIndex, promise);
|
||||||
|
}
|
||||||
|
catalog.setActualNumPages(pagesTree.size);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
2
test/pdfs/.gitignore
vendored
2
test/pdfs/.gitignore
vendored
@ -492,6 +492,8 @@
|
|||||||
!xfa_issue14315.pdf
|
!xfa_issue14315.pdf
|
||||||
!poppler-67295-0.pdf
|
!poppler-67295-0.pdf
|
||||||
!poppler-85140-0.pdf
|
!poppler-85140-0.pdf
|
||||||
|
!poppler-395-0-fuzzed.pdf
|
||||||
|
!GHOSTSCRIPT-698804-1-fuzzed.pdf
|
||||||
!poppler-91414-0-53.pdf
|
!poppler-91414-0-53.pdf
|
||||||
!poppler-91414-0-54.pdf
|
!poppler-91414-0-54.pdf
|
||||||
!poppler-742-0-fuzzed.pdf
|
!poppler-742-0-fuzzed.pdf
|
||||||
|
69
test/pdfs/GHOSTSCRIPT-698804-1-fuzzed.pdf
Normal file
69
test/pdfs/GHOSTSCRIPT-698804-1-fuzzed.pdf
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
%PDF-1.4
|
||||||
|
%âãÏÓ
|
||||||
|
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Catalog
|
||||||
|
/Outline 2 0 R
|
||||||
|
/Pages 3 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Outlines
|
||||||
|
/Count 0
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Pages
|
||||||
|
/Kids [ 4 0 R ]
|
||||||
|
/Count 1
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Page
|
||||||
|
/Parent 3 0 R
|
||||||
|
/MediaBox [ 0 0 612 792 ]
|
||||||
|
/Contents 5 0 R
|
||||||
|
/Resources <<
|
||||||
|
/ProcSet 6 0 R
|
||||||
|
>>
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/Length 0
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
|
||||||
|
6 0 obj
|
||||||
|
[ /PDF ]
|
||||||
|
endobj
|
||||||
|
|
||||||
|
xref
|
||||||
|
0 2
|
||||||
|
0000000000 65536 f
|
||||||
|
0000000016 00000 n
|
||||||
|
00000004294967296 3
|
||||||
|
0000000138 00000 n
|
||||||
|
0000000204 00000 n
|
||||||
|
0000000342 00000 n
|
||||||
|
|
||||||
|
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/Size 7
|
||||||
|
/Root 1 0 R
|
||||||
|
>>
|
||||||
|
|
||||||
|
startxref
|
||||||
|
418
|
||||||
|
%%EOF
|
262
test/pdfs/poppler-395-0-fuzzed.pdf
Normal file
262
test/pdfs/poppler-395-0-fuzzed.pdf
Normal file
File diff suppressed because one or more lines are too long
@ -495,14 +495,27 @@ describe("api", function () {
|
|||||||
const loadingTask2 = getDocument(
|
const loadingTask2 = getDocument(
|
||||||
buildGetDocumentParams("poppler-85140-0.pdf")
|
buildGetDocumentParams("poppler-85140-0.pdf")
|
||||||
);
|
);
|
||||||
|
const loadingTask3 = getDocument(
|
||||||
|
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
|
||||||
|
);
|
||||||
|
const loadingTask4 = getDocument(
|
||||||
|
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
|
||||||
|
);
|
||||||
|
|
||||||
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
|
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||||
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
|
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||||
|
expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||||
|
expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||||
|
|
||||||
const pdfDocument1 = await loadingTask1.promise;
|
const pdfDocument1 = await loadingTask1.promise;
|
||||||
const pdfDocument2 = await loadingTask2.promise;
|
const pdfDocument2 = await loadingTask2.promise;
|
||||||
|
const pdfDocument3 = await loadingTask3.promise;
|
||||||
|
const pdfDocument4 = await loadingTask4.promise;
|
||||||
|
|
||||||
expect(pdfDocument1.numPages).toEqual(1);
|
expect(pdfDocument1.numPages).toEqual(1);
|
||||||
expect(pdfDocument2.numPages).toEqual(1);
|
expect(pdfDocument2.numPages).toEqual(1);
|
||||||
|
expect(pdfDocument3.numPages).toEqual(1);
|
||||||
|
expect(pdfDocument4.numPages).toEqual(1);
|
||||||
|
|
||||||
const pageA = await pdfDocument1.getPage(1);
|
const pageA = await pdfDocument1.getPage(1);
|
||||||
expect(pageA instanceof PDFPageProxy).toEqual(true);
|
expect(pageA instanceof PDFPageProxy).toEqual(true);
|
||||||
@ -516,6 +529,28 @@ describe("api", function () {
|
|||||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||||
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
|
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
|
||||||
}
|
}
|
||||||
|
try {
|
||||||
|
await pdfDocument3.getPage(1);
|
||||||
|
|
||||||
|
// Shouldn't get here.
|
||||||
|
expect(false).toEqual(true);
|
||||||
|
} catch (reason) {
|
||||||
|
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||||
|
expect(reason.message).toEqual(
|
||||||
|
"Page dictionary kid reference points to wrong type of object."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
await pdfDocument4.getPage(1);
|
||||||
|
|
||||||
|
// Shouldn't get here.
|
||||||
|
expect(false).toEqual(true);
|
||||||
|
} catch (reason) {
|
||||||
|
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||||
|
expect(reason.message).toEqual(
|
||||||
|
"Page dictionary kid reference points to wrong type of object."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
|
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
|
||||||
});
|
});
|
||||||
|
Loading…
Reference in New Issue
Block a user