[api-minor] Validate the /Pages-tree /Count entry during document initialization (issue 14303)
*This patch basically extends the approach from PR 10392, by also checking the last page.* Currently, in e.g. the `Catalog.numPages`-getter, we're simply assuming that if the /Pages-tree has an *integer* /Count entry it must also be correct/valid. As can be seen in the referenced PDF documents, that entry may be completely bogus which causes general parsing to breaking down elsewhere in the worker-thread (and hanging the browser). Rather than hoping that the /Count entry is correct, similar to all other data found in PDF documents, we obviously need to validate it. This turns out to be a little less straightforward than one would like, since the only way to do this (as far as I know) is to parse the *entire* /Pages-tree and essentially counting the pages. To avoid doing that for all documents, this patch tries to take a short-cut by checking if the last page (based on the /Count entry) can be successfully fetched. If so, we assume that the /Count entry is correct and use it as-is, otherwise we'll iterate through (potentially) the *entire* /Pages-tree to determine the number of pages. Unfortunately these changes will have a number of *somewhat* negative side-effects, please see a possibly incomplete list below, however I cannot see a better way to address this bug. - This will slow down initial loading/rendering of all documents, at least by some amount, since we now need to fetch/parse more of the /Pages-tree in order to be able to access the *last* page of the PDF documents. - For poorly generated PDF documents, where the entire /Pages-tree only has *one* level, we'll unfortunately need to fetch/parse the *entire* /Pages-tree to get to the last page. While there's a cache to help reduce repeated data lookups, this will affect initial loading/rendering of *some* long PDF documents, - This will affect the `disableAutoFetch = true` mode negatively, since we now need to fetch/parse more data during document initialization. While the `disableAutoFetch = true` mode should still be helpful in larger/longer PDF documents, for smaller ones the effect/usefulness may unfortunately be lost. As one *small* additional bonus, we should now also be able to support opening PDF documents where the /Pages-tree /Count entry is completely invalid (e.g. contains a non-integer value). Fixes two of the issues listed in issue 14303, namely the `poppler-67295-0.pdf` and `poppler-85140-0.pdf` documents.
This commit is contained in:
parent
9a1e27efc5
commit
d0c4bbd828
@ -28,6 +28,7 @@ import {
|
|||||||
import {
|
import {
|
||||||
collectActions,
|
collectActions,
|
||||||
MissingDataException,
|
MissingDataException,
|
||||||
|
PageDictMissingException,
|
||||||
recoverJsURL,
|
recoverJsURL,
|
||||||
toRomanNumerals,
|
toRomanNumerals,
|
||||||
} from "./core_utils.js";
|
} from "./core_utils.js";
|
||||||
@ -70,6 +71,7 @@ class Catalog {
|
|||||||
if (!isDict(this._catDict)) {
|
if (!isDict(this._catDict)) {
|
||||||
throw new FormatError("Catalog object is not a dictionary.");
|
throw new FormatError("Catalog object is not a dictionary.");
|
||||||
}
|
}
|
||||||
|
this._actualNumPages = null;
|
||||||
|
|
||||||
this.fontCache = new RefSetCache();
|
this.fontCache = new RefSetCache();
|
||||||
this.builtInCMapCache = new Map();
|
this.builtInCMapCache = new Map();
|
||||||
@ -534,14 +536,26 @@ class Catalog {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
get numPages() {
|
setActualNumPages(num = null) {
|
||||||
|
this._actualNumPages = num;
|
||||||
|
}
|
||||||
|
|
||||||
|
get hasActualNumPages() {
|
||||||
|
return this._actualNumPages !== null;
|
||||||
|
}
|
||||||
|
|
||||||
|
get _pagesCount() {
|
||||||
const obj = this.toplevelPagesDict.get("Count");
|
const obj = this.toplevelPagesDict.get("Count");
|
||||||
if (!Number.isInteger(obj)) {
|
if (!Number.isInteger(obj)) {
|
||||||
throw new FormatError(
|
throw new FormatError(
|
||||||
"Page count in top-level pages dictionary is not an integer."
|
"Page count in top-level pages dictionary is not an integer."
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
return shadow(this, "numPages", obj);
|
return shadow(this, "_pagesCount", obj);
|
||||||
|
}
|
||||||
|
|
||||||
|
get numPages() {
|
||||||
|
return this.hasActualNumPages ? this._actualNumPages : this._pagesCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
get destinations() {
|
get destinations() {
|
||||||
@ -1071,7 +1085,7 @@ class Catalog {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
getPageDict(pageIndex) {
|
getPageDict(pageIndex, skipCount = false) {
|
||||||
const capability = createPromiseCapability();
|
const capability = createPromiseCapability();
|
||||||
const nodesToVisit = [this._catDict.getRaw("Pages")];
|
const nodesToVisit = [this._catDict.getRaw("Pages")];
|
||||||
const visitedNodes = new RefSet();
|
const visitedNodes = new RefSet();
|
||||||
@ -1133,7 +1147,7 @@ class Catalog {
|
|||||||
}
|
}
|
||||||
|
|
||||||
count = currentNode.get("Count");
|
count = currentNode.get("Count");
|
||||||
if (Number.isInteger(count) && count >= 0) {
|
if (Number.isInteger(count) && count >= 0 && !skipCount) {
|
||||||
// Cache the Kids count, since it can reduce redundant lookups in
|
// Cache the Kids count, since it can reduce redundant lookups in
|
||||||
// documents where all nodes are found at *one* level of the tree.
|
// documents where all nodes are found at *one* level of the tree.
|
||||||
const objId = currentNode.objId;
|
const objId = currentNode.objId;
|
||||||
@ -1177,7 +1191,9 @@ class Catalog {
|
|||||||
nodesToVisit.push(kids[last]);
|
nodesToVisit.push(kids[last]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
capability.reject(new Error(`Page index ${pageIndex} not found.`));
|
capability.reject(
|
||||||
|
new PageDictMissingException(`Page index ${pageIndex} not found.`)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
next();
|
next();
|
||||||
return capability.promise;
|
return capability.promise;
|
||||||
|
@ -60,6 +60,12 @@ class MissingDataException extends BaseException {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class PageDictMissingException extends BaseException {
|
||||||
|
constructor(msg) {
|
||||||
|
super(msg, "PageDictMissingException");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class ParserEOFException extends BaseException {
|
class ParserEOFException extends BaseException {
|
||||||
constructor(msg) {
|
constructor(msg) {
|
||||||
super(msg, "ParserEOFException");
|
super(msg, "ParserEOFException");
|
||||||
@ -541,6 +547,7 @@ export {
|
|||||||
isWhiteSpace,
|
isWhiteSpace,
|
||||||
log2,
|
log2,
|
||||||
MissingDataException,
|
MissingDataException,
|
||||||
|
PageDictMissingException,
|
||||||
ParserEOFException,
|
ParserEOFException,
|
||||||
parseXFAPath,
|
parseXFAPath,
|
||||||
readInt8,
|
readInt8,
|
||||||
|
@ -50,6 +50,7 @@ import {
|
|||||||
getInheritableProperty,
|
getInheritableProperty,
|
||||||
isWhiteSpace,
|
isWhiteSpace,
|
||||||
MissingDataException,
|
MissingDataException,
|
||||||
|
PageDictMissingException,
|
||||||
validateCSSFont,
|
validateCSSFont,
|
||||||
XRefEntryException,
|
XRefEntryException,
|
||||||
XRefParseException,
|
XRefParseException,
|
||||||
@ -787,7 +788,9 @@ class PDFDocument {
|
|||||||
|
|
||||||
get numPages() {
|
get numPages() {
|
||||||
let num = 0;
|
let num = 0;
|
||||||
if (this.xfaFactory) {
|
if (this.catalog.hasActualNumPages) {
|
||||||
|
num = this.catalog.numPages;
|
||||||
|
} else if (this.xfaFactory) {
|
||||||
// num is a Promise.
|
// num is a Promise.
|
||||||
num = this.xfaFactory.getNumPages();
|
num = this.xfaFactory.getNumPages();
|
||||||
} else if (this.linearization) {
|
} else if (this.linearization) {
|
||||||
@ -1331,8 +1334,13 @@ class PDFDocument {
|
|||||||
return promise;
|
return promise;
|
||||||
}
|
}
|
||||||
|
|
||||||
checkFirstPage() {
|
async checkFirstPage(recoveryMode = false) {
|
||||||
return this.getPage(0).catch(async reason => {
|
if (recoveryMode) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
await this.getPage(0);
|
||||||
|
} catch (reason) {
|
||||||
if (reason instanceof XRefEntryException) {
|
if (reason instanceof XRefEntryException) {
|
||||||
// Clear out the various caches to ensure that we haven't stored any
|
// Clear out the various caches to ensure that we haven't stored any
|
||||||
// inconsistent and/or incorrect state, since that could easily break
|
// inconsistent and/or incorrect state, since that could easily break
|
||||||
@ -1342,7 +1350,60 @@ class PDFDocument {
|
|||||||
|
|
||||||
throw new XRefParseException();
|
throw new XRefParseException();
|
||||||
}
|
}
|
||||||
});
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async checkLastPage(recoveryMode = false) {
|
||||||
|
this.catalog.setActualNumPages(); // Ensure that it's always reset.
|
||||||
|
let numPages;
|
||||||
|
|
||||||
|
try {
|
||||||
|
await Promise.all([
|
||||||
|
this.pdfManager.ensureDoc("xfaFactory"),
|
||||||
|
this.pdfManager.ensureDoc("linearization"),
|
||||||
|
this.pdfManager.ensureCatalog("numPages"),
|
||||||
|
]);
|
||||||
|
|
||||||
|
if (this.xfaFactory) {
|
||||||
|
return; // The Page count is always calculated for XFA-documents.
|
||||||
|
} else if (this.linearization) {
|
||||||
|
numPages = this.linearization.numPages;
|
||||||
|
} else {
|
||||||
|
numPages = this.catalog.numPages;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numPages === 1) {
|
||||||
|
return;
|
||||||
|
} else if (!Number.isInteger(numPages)) {
|
||||||
|
throw new FormatError("Page count is not an integer.");
|
||||||
|
}
|
||||||
|
await this.getPage(numPages - 1);
|
||||||
|
} catch (reason) {
|
||||||
|
warn(`checkLastPage - invalid /Pages tree /Count: ${numPages}.`);
|
||||||
|
// Clear out the various caches to ensure that we haven't stored any
|
||||||
|
// inconsistent and/or incorrect state, since that could easily break
|
||||||
|
// subsequent `this.getPage` calls.
|
||||||
|
await this.cleanup();
|
||||||
|
|
||||||
|
let pageIndex = 1; // The first page was already loaded.
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
await this.getPage(pageIndex, /* skipCount = */ true);
|
||||||
|
} catch (reasonLoop) {
|
||||||
|
if (reasonLoop instanceof PageDictMissingException) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (reasonLoop instanceof XRefEntryException) {
|
||||||
|
if (!recoveryMode) {
|
||||||
|
throw new XRefParseException();
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageIndex++;
|
||||||
|
}
|
||||||
|
this.catalog.setActualNumPages(pageIndex);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fontFallback(id, handler) {
|
fontFallback(id, handler) {
|
||||||
|
@ -170,11 +170,12 @@ class WorkerMessageHandler {
|
|||||||
await pdfManager.ensureDoc("parseStartXRef");
|
await pdfManager.ensureDoc("parseStartXRef");
|
||||||
await pdfManager.ensureDoc("parse", [recoveryMode]);
|
await pdfManager.ensureDoc("parse", [recoveryMode]);
|
||||||
|
|
||||||
if (!recoveryMode) {
|
// Check that at least the first page can be successfully loaded,
|
||||||
// Check that at least the first page can be successfully loaded,
|
// since otherwise the XRef table is definitely not valid.
|
||||||
// since otherwise the XRef table is definitely not valid.
|
await pdfManager.ensureDoc("checkFirstPage", [recoveryMode]);
|
||||||
await pdfManager.ensureDoc("checkFirstPage");
|
// Check that the last page can be sucessfully loaded, to ensure that
|
||||||
}
|
// `numPages` is correct, and fallback to walking the entire /Pages-tree.
|
||||||
|
await pdfManager.ensureDoc("checkLastPage", [recoveryMode]);
|
||||||
|
|
||||||
const isPureXfa = await pdfManager.ensureDoc("isPureXfa");
|
const isPureXfa = await pdfManager.ensureDoc("isPureXfa");
|
||||||
if (isPureXfa) {
|
if (isPureXfa) {
|
||||||
|
2
test/pdfs/.gitignore
vendored
2
test/pdfs/.gitignore
vendored
@ -490,3 +490,5 @@
|
|||||||
!PDFBOX-4352-0.pdf
|
!PDFBOX-4352-0.pdf
|
||||||
!REDHAT-1531897-0.pdf
|
!REDHAT-1531897-0.pdf
|
||||||
!xfa_issue14315.pdf
|
!xfa_issue14315.pdf
|
||||||
|
!poppler-67295-0.pdf
|
||||||
|
!poppler-85140-0.pdf
|
||||||
|
40
test/pdfs/poppler-67295-0.pdf
Normal file
40
test/pdfs/poppler-67295-0.pdf
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
%PDF-1.2
|
||||||
|
1 0 obj
|
||||||
|
<</Type /Catalog /Outlines 2 0 R /Pages 6 0 R>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<</Type /Outlines /Count 0>>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<</Length 44>>
|
||||||
|
stream
|
||||||
|
BT /F1 24 Tf 20 750 Td (TestString123) Tj ET
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
[/PDF /Text]
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
<</Type /Font /Subtype /Type1 /Name /F1 /BaseFont /Helvetica /Encoding /MacRomanEncoding>>
|
||||||
|
endobj
|
||||||
|
6 0 obj
|
||||||
|
<</Type /Pages /Kids [7 0 R] /Count 9999999999>>
|
||||||
|
endobj
|
||||||
|
7 0 obj
|
||||||
|
<</Type /Page /Parent 6 0 R /MediaBox [0 0 612 792] /Contents 3 0 R /Resources <</ProcSet 4 0 R /Font <</F1 5 0 R>>>>>>
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 8
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000010 00000 n
|
||||||
|
0000000076 00000 n
|
||||||
|
0000000123 00000 n
|
||||||
|
0000000221 00000 n
|
||||||
|
0000000252 00000 n
|
||||||
|
0000000361 00000 n
|
||||||
|
0000000428 00000 n
|
||||||
|
trailer
|
||||||
|
<</Size 8 /Root 1 0 R>>
|
||||||
|
startxref
|
||||||
|
566
|
||||||
|
%%EOF
|
36
test/pdfs/poppler-85140-0.pdf
Normal file
36
test/pdfs/poppler-85140-0.pdf
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
%PDF-1.4
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Catalog
|
||||||
|
/Pages 2 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/Type /Pages
|
||||||
|
/Kids [3 0 R]
|
||||||
|
/Count 213804087
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
3 18446744073709551616 obj
|
||||||
|
<<
|
||||||
|
/Type /Page
|
||||||
|
/Parent 2 0 R
|
||||||
|
/MediaBox [0 0 595 2147483647]
|
||||||
|
/Contents 4 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
4 233245 obj
|
||||||
|
<<
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
endstream
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/Root 1 0 R
|
||||||
|
>>
|
||||||
|
%%EOF
|
@ -25,6 +25,7 @@ import {
|
|||||||
PasswordResponses,
|
PasswordResponses,
|
||||||
PermissionFlag,
|
PermissionFlag,
|
||||||
StreamType,
|
StreamType,
|
||||||
|
UnknownErrorException,
|
||||||
} from "../../src/shared/util.js";
|
} from "../../src/shared/util.js";
|
||||||
import {
|
import {
|
||||||
buildGetDocumentParams,
|
buildGetDocumentParams,
|
||||||
@ -478,6 +479,38 @@ describe("api", function () {
|
|||||||
|
|
||||||
await loadingTask.destroy();
|
await loadingTask.destroy();
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("creates pdf doc from PDF files, with bad /Pages tree /Count", async function () {
|
||||||
|
const loadingTask1 = getDocument(
|
||||||
|
buildGetDocumentParams("poppler-67295-0.pdf")
|
||||||
|
);
|
||||||
|
const loadingTask2 = getDocument(
|
||||||
|
buildGetDocumentParams("poppler-85140-0.pdf")
|
||||||
|
);
|
||||||
|
expect(loadingTask1 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||||
|
expect(loadingTask2 instanceof PDFDocumentLoadingTask).toEqual(true);
|
||||||
|
|
||||||
|
const pdfDocument1 = await loadingTask1.promise;
|
||||||
|
const pdfDocument2 = await loadingTask2.promise;
|
||||||
|
|
||||||
|
expect(pdfDocument1.numPages).toEqual(1);
|
||||||
|
expect(pdfDocument2.numPages).toEqual(1);
|
||||||
|
|
||||||
|
const pageA = await pdfDocument1.getPage(1);
|
||||||
|
expect(pageA instanceof PDFPageProxy).toEqual(true);
|
||||||
|
|
||||||
|
try {
|
||||||
|
await pdfDocument2.getPage(1);
|
||||||
|
|
||||||
|
// Shouldn't get here.
|
||||||
|
expect(false).toEqual(true);
|
||||||
|
} catch (reason) {
|
||||||
|
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||||
|
expect(reason.message).toEqual("Bad (uncompressed) XRef entry: 3R");
|
||||||
|
}
|
||||||
|
|
||||||
|
await Promise.all([loadingTask1.destroy(), loadingTask2.destroy()]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("PDFWorker", function () {
|
describe("PDFWorker", function () {
|
||||||
@ -683,7 +716,7 @@ describe("api", function () {
|
|||||||
throw new Error("shall fail for invalid page");
|
throw new Error("shall fail for invalid page");
|
||||||
},
|
},
|
||||||
function (reason) {
|
function (reason) {
|
||||||
expect(reason instanceof Error).toEqual(true);
|
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||||
expect(reason.message).toEqual(
|
expect(reason.message).toEqual(
|
||||||
"Pages tree contains circular reference."
|
"Pages tree contains circular reference."
|
||||||
);
|
);
|
||||||
@ -724,7 +757,10 @@ describe("api", function () {
|
|||||||
// Shouldn't get here.
|
// Shouldn't get here.
|
||||||
expect(false).toEqual(true);
|
expect(false).toEqual(true);
|
||||||
} catch (reason) {
|
} catch (reason) {
|
||||||
expect(reason instanceof Error).toEqual(true);
|
expect(reason instanceof UnknownErrorException).toEqual(true);
|
||||||
|
expect(reason.message).toEqual(
|
||||||
|
"The reference does not point to a /Page dictionary."
|
||||||
|
);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user