[api-minor] Clear all caches in XRef.indexObjects
, and improve /Root dictionary validation in XRef.parse
(issue 14303)
*This patch improves handling of a couple of PDF documents from issue 14303.* - Update `XRef.indexObjects` to actually clear *all* XRef-caches. Invalid XRef tables *usually* cause issues early enough during parsing that we've not populated the XRef-cache, however to prevent any issues we obviously need to clear that one as well. - Improve the /Root dictionary validation in `XRef.parse` (PR 9827 follow-up). In addition to checking that a /Pages entry exists, we'll now also check that it can be successfully fetched *and* that it's of the correct type. There's really no point trying to use a /Root dictionary that e.g. `Catalog.toplevelPagesDict` will reject, and this way we'll be able to fallback to indexing the objects in corrupt documents. - Throw an `InvalidPDFException`, rather than a general `FormatError`, in `XRef.parse` when no usable /Root dictionary could be found. That really seems more appropriate overall, since all attempts at parsing/recovery have failed. (This part of the patch is API-observable, hence the tag.) With these changes, two existing test-cases are improved and the unit-tests are updated/re-factored to highlight that. In particular `GHOSTSCRIPT-698804-1-fuzzed.pdf` will now both load and "render" correctly, whereas `poppler-395-0-fuzzed.pdf` will now fail immediately upon loading (rather than *appearing* to work).
This commit is contained in:
parent
e9e4b913c0
commit
ad3a271fc4
@ -107,14 +107,26 @@ class XRef {
|
||||
}
|
||||
warn(`XRef.parse - Invalid "Root" reference: "${ex}".`);
|
||||
}
|
||||
if (root instanceof Dict && root.has("Pages")) {
|
||||
this.root = root;
|
||||
} else {
|
||||
if (!recoveryMode) {
|
||||
throw new XRefParseException();
|
||||
if (root instanceof Dict) {
|
||||
try {
|
||||
const pages = root.get("Pages");
|
||||
if (pages instanceof Dict) {
|
||||
this.root = root;
|
||||
return;
|
||||
}
|
||||
} catch (ex) {
|
||||
if (ex instanceof MissingDataException) {
|
||||
throw ex;
|
||||
}
|
||||
warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`);
|
||||
}
|
||||
throw new FormatError("Invalid root reference");
|
||||
}
|
||||
|
||||
if (!recoveryMode) {
|
||||
throw new XRefParseException();
|
||||
}
|
||||
// Even recovery failed, there's nothing more we can do here.
|
||||
throw new InvalidPDFException("Invalid Root reference.");
|
||||
}
|
||||
|
||||
processXRefTable(parser) {
|
||||
@ -417,6 +429,7 @@ class XRef {
|
||||
|
||||
// Clear out any existing entries, since they may be bogus.
|
||||
this.entries.length = 0;
|
||||
this._cacheMap.clear();
|
||||
|
||||
const stream = this.stream;
|
||||
stream.pos = 0;
|
||||
|
@ -445,7 +445,7 @@ describe("api", function () {
|
||||
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
|
||||
});
|
||||
|
||||
it("creates pdf doc from PDF file with bad XRef table", async function () {
|
||||
it("creates pdf doc from PDF file with bad XRef entry", async function () {
|
||||
// A corrupt PDF file, where the XRef table have (some) bogus entries.
|
||||
const loadingTask = getDocument(
|
||||
buildGetDocumentParams("PDFBOX-4352-0.pdf", {
|
||||
@ -468,6 +468,26 @@ describe("api", function () {
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("creates pdf doc from PDF file with bad XRef header", async function () {
|
||||
const loadingTask = getDocument(
|
||||
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
|
||||
);
|
||||
expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
|
||||
const pdfDocument = await loadingTask.promise;
|
||||
expect(pdfDocument.numPages).toEqual(1);
|
||||
|
||||
const page = await pdfDocument.getPage(1);
|
||||
expect(page instanceof PDFPageProxy).toEqual(true);
|
||||
|
||||
const opList = await page.getOperatorList();
|
||||
expect(opList.fnArray.length).toEqual(0);
|
||||
expect(opList.argsArray.length).toEqual(0);
|
||||
expect(opList.lastChunk).toEqual(true);
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("creates pdf doc from PDF file with bad XRef byteWidths", async function () {
|
||||
// A corrupt PDF file, where the XRef /W-array have (some) bogus entries.
|
||||
const loadingTask = getDocument(
|
||||
@ -488,6 +508,25 @@ describe("api", function () {
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("creates pdf doc from PDF file with inaccessible /Pages tree", async function () {
|
||||
const loadingTask = getDocument(
|
||||
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
|
||||
);
|
||||
expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
|
||||
try {
|
||||
await loadingTask.promise;
|
||||
|
||||
// Shouldn't get here.
|
||||
expect(false).toEqual(true);
|
||||
} catch (reason) {
|
||||
expect(reason instanceof InvalidPDFException).toEqual(true);
|
||||
expect(reason.message).toEqual("Invalid Root reference.");
|
||||
}
|
||||
|
||||
await loadingTask.destroy();
|
||||
});
|
||||
|
||||
it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
|
||||
const loadingTask1 = getDocument(
|
||||
buildGetDocumentParams("poppler-67295-0.pdf")
|
||||
@ -495,30 +534,23 @@ describe("api", function () {
|
||||
const loadingTask2 = getDocument(
|
||||
buildGetDocumentParams("poppler-85140-0.pdf")
|
||||
);
|
||||
const loadingTask3 = getDocument(
|
||||
buildGetDocumentParams("poppler-395-0-fuzzed.pdf")
|
||||
);
|
||||
const loadingTask4 = getDocument(
|
||||
buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf")
|
||||
);
|
||||
|
||||
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||
|
||||
const pdfDocument1 = await loadingTask1.promise;
|
||||
const pdfDocument2 = await loadingTask2.promise;
|
||||
const pdfDocument3 = await loadingTask3.promise;
|
||||
const pdfDocument4 = await loadingTask4.promise;
|
||||
|
||||
expect(pdfDocument1.numPages).toEqual(1);
|
||||
expect(pdfDocument2.numPages).toEqual(1);
|
||||
expect(pdfDocument3.numPages).toEqual(1);
|
||||
expect(pdfDocument4.numPages).toEqual(1);
|
||||
|
||||
const pageA = await pdfDocument1.getPage(1);
|
||||
expect(pageA instanceof PDFPageProxy).toEqual(true);
|
||||
const page = await pdfDocument1.getPage(1);
|
||||
expect(page instanceof PDFPageProxy).toEqual(true);
|
||||
|
||||
const opList = await page.getOperatorList();
|
||||
expect(opList.fnArray.length).toBeGreaterThan(5);
|
||||
expect(opList.argsArray.length).toBeGreaterThan(5);
|
||||
expect(opList.lastChunk).toEqual(true);
|
||||
|
||||
try {
|
||||
await pdfDocument2.getPage(1);
|
||||
@ -529,28 +561,6 @@ describe("api", function () {
|
||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
|
||||
}
|
||||
try {
|
||||
await pdfDocument3.getPage(1);
|
||||
|
||||
// Shouldn't get here.
|
||||
expect(false).toEqual(true);
|
||||
} catch (reason) {
|
||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||
expect(reason.message).toEqual(
|
||||
"Page dictionary kid reference points to wrong type of object."
|
||||
);
|
||||
}
|
||||
try {
|
||||
await pdfDocument4.getPage(1);
|
||||
|
||||
// Shouldn't get here.
|
||||
expect(false).toEqual(true);
|
||||
} catch (reason) {
|
||||
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||
expect(reason.message).toEqual(
|
||||
"Page dictionary kid reference points to wrong type of object."
|
||||
);
|
||||
}
|
||||
|
||||
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user