Handle PDF files with missing 'endobj' operators, by searching for the "obj" string rather than "endobj" in XRef.indexObjects
(issue 9105)
This patch refactors the search for 'endobj': it finds the next occurrence of "obj", checks whether it is in fact an 'endobj', and continues searching otherwise. This approach avoids having to first find 'endobj' and then re-check the entire contents of the object, which would mean running (potentially expensive) regular expressions on arbitrarily long strings. Fixes 9105.
This commit is contained in:
parent
6bbe91079b
commit
1dc54ddb40
@ -1102,10 +1102,14 @@ var XRef = (function XRefClosure() {
|
||||
return skipped;
|
||||
}
|
||||
var objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
|
||||
const endobjRegExp = /\bendobj[\b\s]$/;
|
||||
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s])$/;
|
||||
const CHECK_CONTENT_LENGTH = 25;
|
||||
|
||||
var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
|
||||
var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114,
|
||||
101, 102]);
|
||||
var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]);
|
||||
const objBytes = new Uint8Array([111, 98, 106]);
|
||||
var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
|
||||
|
||||
// Clear out any existing entries, since they may be bogus.
|
||||
@ -1147,8 +1151,36 @@ var XRef = (function XRefClosure() {
|
||||
uncompressed: true,
|
||||
};
|
||||
}
|
||||
var contentLength = skipUntil(buffer, position, endobjBytes) + 7;
|
||||
var content = buffer.subarray(position, position + contentLength);
|
||||
let contentLength, startPos = position + token.length;
|
||||
|
||||
// Find the next "obj" string, rather than "endobj", to ensure that
|
||||
// we won't skip over a new 'obj' operator in corrupt files where
|
||||
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
|
||||
while (startPos < buffer.length) {
|
||||
let endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4;
|
||||
contentLength = endPos - position;
|
||||
|
||||
let checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos);
|
||||
let tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
|
||||
|
||||
// Check if the current object ends with an 'endobj' operator.
|
||||
if (endobjRegExp.test(tokenStr)) {
|
||||
break;
|
||||
} else {
|
||||
// Check if an "obj" occurrence is actually a new object,
|
||||
// i.e. the current object is missing the 'endobj' operator.
|
||||
let objToken = nestedObjRegExp.exec(tokenStr);
|
||||
|
||||
if (objToken && objToken[1]) {
|
||||
warn('indexObjects: Found new "obj" inside of another "obj", ' +
|
||||
'caused by missing "endobj" -- trying to recover.');
|
||||
contentLength -= objToken[1].length;
|
||||
break;
|
||||
}
|
||||
}
|
||||
startPos += contentLength;
|
||||
}
|
||||
let content = buffer.subarray(position, position + contentLength);
|
||||
|
||||
// checking XRef stream suspect
|
||||
// (it shall have '/XRef' and next char is not a letter)
|
||||
|
1
test/pdfs/.gitignore
vendored
1
test/pdfs/.gitignore
vendored
@ -64,6 +64,7 @@
|
||||
!issue8798r.pdf
|
||||
!issue8823.pdf
|
||||
!issue9084.pdf
|
||||
!issue9105_reduced.pdf
|
||||
!bad-PageLabels.pdf
|
||||
!filled-background.pdf
|
||||
!ArabicCIDTrueType.pdf
|
||||
|
74
test/pdfs/issue9105_reduced.pdf
Normal file
74
test/pdfs/issue9105_reduced.pdf
Normal file
@ -0,0 +1,74 @@
|
||||
%PDF-1.7
|
||||
%âãÏÓ
|
||||
1 0 obj
|
||||
<<
|
||||
/Title (Issue 9105)
|
||||
/Author (Snuffleupagus)
|
||||
>>
|
||||
2 0 obj
|
||||
<<
|
||||
/Pages 3 0 R
|
||||
/Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/Kids [4 0 R]
|
||||
/Count 1
|
||||
/Type /Pages
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Parent 3 0 R
|
||||
/MediaBox [0 0 200 50]
|
||||
/Resources
|
||||
<<
|
||||
/Font
|
||||
<<
|
||||
/F1 5 0 R
|
||||
>>
|
||||
>>
|
||||
/Contents 6 0 R
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/BaseFont /Times-Roman
|
||||
/Subtype /Type1
|
||||
/Encoding /WinAnsiEncoding
|
||||
/Type /Font
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Length 41
|
||||
>>
|
||||
stream
|
||||
BT
|
||||
10 20 TD
|
||||
/F1 20 Tf
|
||||
(Issue 9105) Tj
|
||||
ET
|
||||
|
||||
endstream
|
||||
endobj xref
|
||||
0 7
|
||||
0000000000 65535 f
|
||||
0000000001 00000 n
|
||||
0000000002 00000 n
|
||||
0000000003 00000 n
|
||||
0000000004 00000 n
|
||||
0000000005 00000 n
|
||||
0000000006 00000 n
|
||||
trailer
|
||||
|
||||
<<
|
||||
/Info 1 0 R
|
||||
/Root 2 0 R
|
||||
/Size 7
|
||||
>>
|
||||
startxref
|
||||
491
|
||||
%%EOF
|
@ -741,6 +741,13 @@
|
||||
"lastPage": 1,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "issue9105",
|
||||
"file": "pdfs/issue9105_reduced.pdf",
|
||||
"md5": "f3889f7c7b60e1ab998aac430cc7e08e",
|
||||
"rounds": 1,
|
||||
"link": false,
|
||||
"type": "eq"
|
||||
},
|
||||
{ "id": "issue6289",
|
||||
"file": "pdfs/issue6289.pdf",
|
||||
"md5": "0869f3d147c734ec484ffd492104095d",
|
||||
|
Loading…
Reference in New Issue
Block a user