Merge pull request #15854 from Snuffleupagus/issue-15803

Re-factor searching for incomplete objects in `XRef.indexObjects` (issue 15803)
This commit is contained in:
Jonas Jenwald 2022-12-24 10:23:39 +01:00 committed by GitHub
commit 8aed0c3613
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 39 additions and 39 deletions

View File

@@ -431,16 +431,14 @@ class XRef {
} }
return skipped; return skipped;
} }
const gEndobjRegExp = /\b(endobj|\d+\s+\d+\s+obj|xref|trailer)\b/g;
const gStartxrefRegExp = /\b(startxref|\d+\s+\d+\s+obj)\b/g;
const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/; const objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
const endobjRegExp = /\bendobj[\b\s]$/;
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s<])$/;
const CHECK_CONTENT_LENGTH = 25;
const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); const trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
const startxrefBytes = new Uint8Array([ const startxrefBytes = new Uint8Array([
115, 116, 97, 114, 116, 120, 114, 101, 102, 115, 116, 97, 114, 116, 120, 114, 101, 102,
]); ]);
const objBytes = new Uint8Array([111, 98, 106]);
const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); const xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
// Clear out any existing entries, since they may be bogus. // Clear out any existing entries, since they may be bogus.
@@ -450,6 +448,7 @@ class XRef {
const stream = this.stream; const stream = this.stream;
stream.pos = 0; stream.pos = 0;
const buffer = stream.getBytes(), const buffer = stream.getBytes(),
bufferStr = bytesToString(buffer),
length = buffer.length; length = buffer.length;
let position = stream.start; let position = stream.start;
const trailers = [], const trailers = [],
@@ -484,8 +483,8 @@ class XRef {
const num = m[1] | 0, const num = m[1] | 0,
gen = m[2] | 0; gen = m[2] | 0;
const startPos = position + token.length;
let contentLength, let contentLength,
startPos = position + token.length,
updateEntries = false; updateEntries = false;
if (!this.entries[num]) { if (!this.entries[num]) {
updateEntries = true; updateEntries = true;
@@ -519,31 +518,22 @@ class XRef {
// Find the next "obj" string, rather than "endobj", to ensure that // Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where // we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf). // 'endobj' operators are missing (fixes issue9105_reduced.pdf).
while (startPos < length) { gEndobjRegExp.lastIndex = startPos;
const endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4; const match = gEndobjRegExp.exec(bufferStr);
if (match) {
const endPos = gEndobjRegExp.lastIndex + 1;
contentLength = endPos - position; contentLength = endPos - position;
const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos); if (match[1] !== "endobj") {
const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); warn(
`indexObjects: Found "${match[1]}" inside of another "obj", ` +
// Check if the current object ends with an 'endobj' operator. 'caused by missing "endobj" -- trying to recover.'
if (endobjRegExp.test(tokenStr)) { );
break; contentLength -= match[1].length + 1;
} else {
// Check if an "obj" occurrence is actually a new object,
// i.e. the current object is missing the 'endobj' operator.
const objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
warn(
'indexObjects: Found new "obj" inside of another "obj", ' +
'caused by missing "endobj" -- trying to recover.'
);
contentLength -= objToken[1].length;
break;
}
} }
startPos = endPos; } else {
contentLength = length - position;
} }
const content = buffer.subarray(position, position + contentLength); const content = buffer.subarray(position, position + contentLength);
@@ -562,26 +552,26 @@ class XRef {
) { ) {
trailers.push(position); trailers.push(position);
const contentLength = skipUntil(buffer, position, startxrefBytes); const startPos = position + token.length;
let contentLength;
// Attempt to handle (some) corrupt documents, where no 'startxref' // Attempt to handle (some) corrupt documents, where no 'startxref'
// operators are present (fixes issue15590.pdf). // operators are present (fixes issue15590.pdf).
if (position + contentLength >= length) { gStartxrefRegExp.lastIndex = startPos;
const endPos = position + skipUntil(buffer, position, objBytes) + 4; const match = gStartxrefRegExp.exec(bufferStr);
const checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, position); if (match) {
const tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); const endPos = gStartxrefRegExp.lastIndex + 1;
contentLength = endPos - position;
// Find the first "obj" occurrence after the 'trailer' operator. if (match[1] !== "startxref") {
const objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
warn( warn(
'indexObjects: Found first "obj" after "trailer", ' + `indexObjects: Found "${match[1]}" after "trailer", ` +
'caused by missing "startxref" -- trying to recover.' 'caused by missing "startxref" -- trying to recover.'
); );
position = endPos - objToken[1].length; contentLength -= match[1].length + 1;
continue;
} }
} else {
contentLength = length - position;
} }
position += contentLength; position += contentLength;
} else { } else {

View File

@@ -0,0 +1 @@
https://github.com/mozilla/pdf.js/files/10200431/ocg.pdf

View File

@@ -1761,6 +1761,15 @@
"link": false, "link": false,
"type": "eq" "type": "eq"
}, },
{ "id": "issue15803",
"file": "pdfs/issue15803.pdf",
"md5": "e501a4418d4ece5be7ce4e8acf029100",
"rounds": 1,
"link": true,
"lastPage": 1,
"type": "eq",
"annotations": true
},
{ "id": "issue9105_other", { "id": "issue9105_other",
"file": "pdfs/issue9105_other.pdf", "file": "pdfs/issue9105_other.pdf",
"md5": "4c8b9c2cceb9c5d621e1d50b3dc38efc", "md5": "4c8b9c2cceb9c5d621e1d50b3dc38efc",