Handle PDF files with missing 'endobj' operators, by searching for the "obj" string rather than "endobj" in XRef.indexObjects (issue 9105)

This patch refactors the searching for 'endobj', to try and find the next occurrence of "obj" and then check if it was in fact an 'endobj' and continue searching otherwise.
This approach is used to avoid having to first find 'endobj', and then re-check the entire contents of the object and having to run (potentially expensive) regular expressions on arbitrarily long strings.

Fixes 9105.
This commit is contained in:
Jonas Jenwald 2017-12-08 16:37:12 +01:00
parent 6bbe91079b
commit 1dc54ddb40
4 changed files with 117 additions and 3 deletions

View File

@ -1102,10 +1102,14 @@ var XRef = (function XRefClosure() {
return skipped;
}
var objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
const endobjRegExp = /\bendobj[\b\s]$/;
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s])$/;
const CHECK_CONTENT_LENGTH = 25;
var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114,
101, 102]);
var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]);
const objBytes = new Uint8Array([111, 98, 106]);
var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
// Clear out any existing entries, since they may be bogus.
@ -1147,8 +1151,36 @@ var XRef = (function XRefClosure() {
uncompressed: true,
};
}
var contentLength = skipUntil(buffer, position, endobjBytes) + 7;
var content = buffer.subarray(position, position + contentLength);
let contentLength, startPos = position + token.length;
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
while (startPos < buffer.length) {
let endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4;
contentLength = endPos - position;
let checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos);
let tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
// Check if the current object ends with an 'endobj' operator.
if (endobjRegExp.test(tokenStr)) {
break;
} else {
// Check if an "obj" occurrence is actually a new object,
// i.e. the current object is missing the 'endobj' operator.
let objToken = nestedObjRegExp.exec(tokenStr);
if (objToken && objToken[1]) {
warn('indexObjects: Found new "obj" inside of another "obj", ' +
'caused by missing "endobj" -- trying to recover.');
contentLength -= objToken[1].length;
break;
}
}
startPos += contentLength;
}
let content = buffer.subarray(position, position + contentLength);
// checking XRef stream suspect
// (it shall have '/XRef' and next char is not a letter)

View File

@ -64,6 +64,7 @@
!issue8798r.pdf
!issue8823.pdf
!issue9084.pdf
!issue9105_reduced.pdf
!bad-PageLabels.pdf
!filled-background.pdf
!ArabicCIDTrueType.pdf

View File

@ -0,0 +1,74 @@
%PDF-1.7
%âãÏÓ
1 0 obj
<<
/Title (Issue 9105)
/Author (Snuffleupagus)
>>
2 0 obj
<<
/Pages 3 0 R
/Type /Catalog
>>
endobj
3 0 obj
<<
/Kids [4 0 R]
/Count 1
/Type /Pages
>>
endobj
4 0 obj
<<
/Parent 3 0 R
/MediaBox [0 0 200 50]
/Resources
<<
/Font
<<
/F1 5 0 R
>>
>>
/Contents 6 0 R
/Type /Page
>>
endobj
5 0 obj
<<
/BaseFont /Times-Roman
/Subtype /Type1
/Encoding /WinAnsiEncoding
/Type /Font
>>
endobj
6 0 obj
<<
/Length 41
>>
stream
BT
10 20 TD
/F1 20 Tf
(Issue 9105) Tj
ET
endstream
endobj xref
0 7
0000000000 65535 f
0000000001 00000 n
0000000002 00000 n
0000000003 00000 n
0000000004 00000 n
0000000005 00000 n
0000000006 00000 n
trailer
<<
/Info 1 0 R
/Root 2 0 R
/Size 7
>>
startxref
491
%%EOF

View File

@ -741,6 +741,13 @@
"lastPage": 1,
"type": "eq"
},
{ "id": "issue9105",
"file": "pdfs/issue9105_reduced.pdf",
"md5": "f3889f7c7b60e1ab998aac430cc7e08e",
"rounds": 1,
"link": false,
"type": "eq"
},
{ "id": "issue6289",
"file": "pdfs/issue6289.pdf",
"md5": "0869f3d147c734ec484ffd492104095d",