Merge pull request #15854 from Snuffleupagus/issue-15803

Refactor the search for incomplete objects in `XRef.indexObjects` (issue 15803)
This commit is contained in:
Jonas Jenwald 2022-12-24 10:23:39 +01:00 committed by GitHub
commit 8aed0c3613
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 39 deletions

View File

@ -431,16 +431,14 @@ class XRef {
}
return skipped;
}
const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer)\b/g;
const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g;
const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
const endobjRegExp = /\bendobj[\b\s]$/;
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s<])$/;
const CHECK_CONTENT_LENGTH = 25;
const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
const startxrefBytes = new Uint8Array([
115, 116, 97, 114, 116, 120, 114, 101, 102,
]);
const objBytes = new Uint8Array([111, 98, 106]);
const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
// Clear out any existing entries, since they may be bogus.
@ -450,6 +448,7 @@ class XRef {
const stream = this.stream;
stream.pos = 0;
const buffer = stream.getBytes(),
bufferStr = bytesToString(buffer),
length = buffer.length;
let position = stream.start;
const trailers = [],
@ -484,8 +483,8 @@ class XRef {
const num = m[1] | 0,
gen = m[2] | 0;
const startPos = position + token.length;
let contentLength,
startPos = position + token.length,
updateEntries = false;
if (!this.entries[num]) {
updateEntries = true;
@ -519,31 +518,22 @@ class XRef {
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
while (startPos < length) {
const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4;
gEndobjRegExp.lastIndex = startPos;
const match = gEndobjRegExp.exec(bufferStr);
if (match) {
const endPos = gEndobjRegExp.lastIndex + 1;
contentLength = endPos - position;
const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos);
const tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
// Check if the current object ends with an 'endobj' operator.
if (endobjRegExp.test(tokenStr)) {
break;
} else {
// Check if an "obj" occurrence is actually a new object,
// i.e. the current object is missing the 'endobj' operator.
const objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
warn(
'indexObjects: Found new "obj" inside of another "obj", ' +
'caused by missing "endobj" -- trying to recover.'
);
contentLength -= objToken[1].length;
break;
}
if (match[1] !== "endobj") {
warn(
`indexObjects: Found "${match[1]}" inside of another "obj", ` +
'caused by missing "endobj" -- trying to recover.'
);
contentLength -= match[1].length + 1;
}
startPos = endPos;
} else {
contentLength = length - position;
}
const content = buffer.subarray(position, position + contentLength);
@ -562,26 +552,26 @@ class XRef {
) {
trailers.push(position);
const contentLength = skipUntil(buffer, position, startxrefBytes);
const startPos = position + token.length;
let contentLength;
// Attempt to handle (some) corrupt documents, where no 'startxref'
// operators are present (fixes issue15590.pdf).
if (position + contentLength >= length) {
const endPos = position + skipUntil(buffer, position, objBytes) + 4;
gStartxrefRegExp.lastIndex = startPos;
const match = gStartxrefRegExp.exec(bufferStr);
const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, position);
const tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
if (match) {
const endPos = gStartxrefRegExp.lastIndex + 1;
contentLength = endPos - position;
// Find the first "obj" occurrence after the 'trailer' operator.
const objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
if (match[1] !== "startxref") {
warn(
'indexObjects: Found first "obj" after "trailer", ' +
`indexObjects: Found "${match[1]}" after "trailer", ` +
'caused by missing "startxref" -- trying to recover.'
);
position = endPos - objToken[1].length;
continue;
contentLength -= match[1].length + 1;
}
} else {
contentLength = length - position;
}
position += contentLength;
} else {

View File

@ -0,0 +1 @@
https://github.com/mozilla/pdf.js/files/10200431/ocg.pdf

View File

@ -1761,6 +1761,15 @@
"link": false,
"type": "eq"
},
{ "id": "issue15803",
"file": "pdfs/issue15803.pdf",
"md5": "e501a4418d4ece5be7ce4e8acf029100",
"rounds": 1,
"link": true,
"lastPage": 1,
"type": "eq",
"annotations": true
},
{ "id": "issue9105_other",
"file": "pdfs/issue9105_other.pdf",
"md5": "4c8b9c2cceb9c5d621e1d50b3dc38efc",