From ad3a271fc40c71b1a1c8ac2ca4c09bc31e4a5df3 Mon Sep 17 00:00:00 2001 From: Jonas Jenwald Date: Thu, 2 Dec 2021 16:40:31 +0100 Subject: [PATCH] [api-minor] Clear all caches in `XRef.indexObjects`, and improve /Root dictionary validation in `XRef.parse` (issue 14303) *This patch improves handling of a couple of PDF documents from issue 14303.* - Update `XRef.indexObjects` to actually clear *all* XRef-caches. Invalid XRef tables *usually* cause issues early enough during parsing that we've not populated the XRef-cache, however to prevent any issues we obviously need to clear that one as well. - Improve the /Root dictionary validation in `XRef.parse` (PR 9827 follow-up). In addition to checking that a /Pages entry exists, we'll now also check that it can be successfully fetched *and* that it's of the correct type. There's really no point trying to use a /Root dictionary that e.g. `Catalog.toplevelPagesDict` will reject, and this way we'll be able to fallback to indexing the objects in corrupt documents. - Throw an `InvalidPDFException`, rather than a general `FormatError`, in `XRef.parse` when no usable /Root dictionary could be found. That really seems more appropriate overall, since all attempts at parsing/recovery have failed. (This part of the patch is API-observable, hence the tag.) With these changes, two existing test-cases are improved and the unit-tests are updated/re-factored to highlight that. In particular `GHOSTSCRIPT-698804-1-fuzzed.pdf` will now both load and "render" correctly, whereas `poppler-395-0-fuzzed.pdf` will now fail immediately upon loading (rather than *appearing* to work). --- src/core/xref.js | 25 +++++++++---- test/unit/api_spec.js | 84 ++++++++++++++++++++++++------------------- 2 files changed, 66 insertions(+), 43 deletions(-) diff --git a/src/core/xref.js b/src/core/xref.js index 7c2c2172a..acd3ec3a4 100644 --- a/src/core/xref.js +++ b/src/core/xref.js @@ -107,14 +107,26 @@ class XRef { } warn(`XRef.parse - Invalid "Root" reference: "${ex}".`); } - if (root instanceof Dict && root.has("Pages")) { - this.root = root; - } else { - if (!recoveryMode) { - throw new XRefParseException(); + if (root instanceof Dict) { + try { + const pages = root.get("Pages"); + if (pages instanceof Dict) { + this.root = root; + return; + } + } catch (ex) { + if (ex instanceof MissingDataException) { + throw ex; + } + warn(`XRef.parse - Invalid "Pages" reference: "${ex}".`); } - throw new FormatError("Invalid root reference"); } + + if (!recoveryMode) { + throw new XRefParseException(); + } + // Even recovery failed, there's nothing more we can do here. + throw new InvalidPDFException("Invalid Root reference."); } processXRefTable(parser) { @@ -417,6 +429,7 @@ class XRef { // Clear out any existing entries, since they may be bogus. this.entries.length = 0; + this._cacheMap.clear(); const stream = this.stream; stream.pos = 0; diff --git a/test/unit/api_spec.js b/test/unit/api_spec.js index 81b9f0d48..a5552d52e 100644 --- a/test/unit/api_spec.js +++ b/test/unit/api_spec.js @@ -445,7 +445,7 @@ describe("api", function () { await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); }); - it("creates pdf doc from PDF file with bad XRef table", async function () { + it("creates pdf doc from PDF file with bad XRef entry", async function () { // A corrupt PDF file, where the XRef table have (some) bogus entries. const loadingTask = getDocument( buildGetDocumentParams("PDFBOX-4352-0.pdf", { @@ -468,6 +468,26 @@ describe("api", function () { await loadingTask.destroy(); }); + it("creates pdf doc from PDF file with bad XRef header", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf") + ); + expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true); + + const pdfDocument = await loadingTask.promise; + expect(pdfDocument.numPages).toEqual(1); + + const page = await pdfDocument.getPage(1); + expect(page instanceof PDFPageProxy).toEqual(true); + + const opList = await page.getOperatorList(); + expect(opList.fnArray.length).toEqual(0); + expect(opList.argsArray.length).toEqual(0); + expect(opList.lastChunk).toEqual(true); + + await loadingTask.destroy(); + }); + it("creates pdf doc from PDF file with bad XRef byteWidths", async function () { // A corrupt PDF file, where the XRef /W-array have (some) bogus entries. const loadingTask = getDocument( @@ -488,6 +508,25 @@ describe("api", function () { await loadingTask.destroy(); }); + it("creates pdf doc from PDF file with inaccessible /Pages tree", async function () { + const loadingTask = getDocument( + buildGetDocumentParams("poppler-395-0-fuzzed.pdf") + ); + expect(loadingTask instanceof PDFDocumentLoadingTask).toEqual(true); + + try { + await loadingTask.promise; + + // Shouldn't get here. + expect(false).toEqual(true); + } catch (reason) { + expect(reason instanceof InvalidPDFException).toEqual(true); + expect(reason.message).toEqual("Invalid Root reference."); + } + + await loadingTask.destroy(); + }); + it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () { const loadingTask1 = getDocument( buildGetDocumentParams("poppler-67295-0.pdf") @@ -495,30 +534,23 @@ describe("api", function () { const loadingTask2 = getDocument( buildGetDocumentParams("poppler-85140-0.pdf") ); - const loadingTask3 = getDocument( - buildGetDocumentParams("poppler-395-0-fuzzed.pdf") - ); - const loadingTask4 = getDocument( - buildGetDocumentParams("GHOSTSCRIPT-698804-1-fuzzed.pdf") - ); expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true); expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true); - expect(loadingTask3 instanceof PDFDocumentLoadingTask).toEqual(true); - expect(loadingTask4 instanceof PDFDocumentLoadingTask).toEqual(true); const pdfDocument1 = await loadingTask1.promise; const pdfDocument2 = await loadingTask2.promise; - const pdfDocument3 = await loadingTask3.promise; - const pdfDocument4 = await loadingTask4.promise; expect(pdfDocument1.numPages).toEqual(1); expect(pdfDocument2.numPages).toEqual(1); - expect(pdfDocument3.numPages).toEqual(1); - expect(pdfDocument4.numPages).toEqual(1); - const pageA = await pdfDocument1.getPage(1); - expect(pageA instanceof PDFPageProxy).toEqual(true); + const page = await pdfDocument1.getPage(1); + expect(page instanceof PDFPageProxy).toEqual(true); + + const opList = await page.getOperatorList(); + expect(opList.fnArray.length).toBeGreaterThan(5); + expect(opList.argsArray.length).toBeGreaterThan(5); + expect(opList.lastChunk).toEqual(true); try { await pdfDocument2.getPage(1); @@ -529,28 +561,6 @@ describe("api", function () { expect(reason instanceof UnknownErrorException).toEqual(true); expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R"); } - try { - await pdfDocument3.getPage(1); - - // Shouldn't get here. - expect(false).toEqual(true); - } catch (reason) { - expect(reason instanceof UnknownErrorException).toEqual(true); - expect(reason.message).toEqual( - "Page dictionary kid reference points to wrong type of object." - ); - } - try { - await pdfDocument4.getPage(1); - - // Shouldn't get here. - expect(false).toEqual(true); - } catch (reason) { - expect(reason instanceof UnknownErrorException).toEqual(true); - expect(reason.message).toEqual( - "Page dictionary kid reference points to wrong type of object." - ); - } await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]); });