Merge pull request #9288 from Snuffleupagus/issue-9105-2

Handle PDF files with missing 'endobj' operators, by searching for the "obj" string rather than "endobj" in `XRef.indexObjects` (issue 9105)
This commit is contained in:
Tim van der Meij 2017-12-18 23:05:38 +01:00 committed by GitHub
commit 8ae3fd49f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 117 additions and 3 deletions

View File

@ -1102,10 +1102,14 @@ var XRef = (function XRefClosure() {
return skipped;
}
var objRegExp = /^(\d+)\s+(\d+)\s+obj\b/;
const endobjRegExp = /\bendobj[\b\s]$/;
const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s])$/;
const CHECK_CONTENT_LENGTH = 25;
var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]);
var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114,
101, 102]);
var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]);
const objBytes = new Uint8Array([111, 98, 106]);
var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]);
// Clear out any existing entries, since they may be bogus.
@ -1147,8 +1151,36 @@ var XRef = (function XRefClosure() {
uncompressed: true,
};
}
var contentLength = skipUntil(buffer, position, endobjBytes) + 7;
var content = buffer.subarray(position, position + contentLength);
// Determine how many bytes, counted from `position`, belong to the current
// object, and slice them out of `buffer` as `content`.
let contentLength, startPos = position + token.length;
// Find the next "obj" string, rather than "endobj", to ensure that
// we won't skip over a new 'obj' operator in corrupt files where
// 'endobj' operators are missing (fixes issue9105_reduced.pdf).
while (startPos < buffer.length) {
  // The `+ 4` accounts for the three bytes of "obj" plus the single byte
  // following it, which both regular expressions below expect at the very end
  // of the inspected string.
  // NOTE(review): presumably `skipUntil` returns the number of bytes skipped
  // before the match -- confirm against its definition.
  let endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4;
  contentLength = endPos - position;
  // Only the last `CHECK_CONTENT_LENGTH` bytes before `endPos` are inspected,
  // and never anything before the current search start.
  let checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos);
  let tokenStr = bytesToString(buffer.subarray(checkPos, endPos));
  // Check if the current object ends with an 'endobj' operator.
  if (endobjRegExp.test(tokenStr)) {
    break;
  } else {
    // Check if an "obj" occurrence is actually a new object,
    // i.e. the current object is missing the 'endobj' operator.
    let objToken = nestedObjRegExp.exec(tokenStr);
    if (objToken && objToken[1]) {
      warn('indexObjects: Found new "obj" inside of another "obj", ' +
           'caused by missing "endobj" -- trying to recover.');
      // Exclude the header of the new object from the current object.
      contentLength -= objToken[1].length;
      break;
    }
  }
  // Resume the search right after the bytes that were just examined.
  // `contentLength` is measured from `position`, not from `startPos`, so
  // adding it to `startPos` would jump past `endPos` and leave bytes
  // unsearched.
  startPos = endPos;
}
let content = buffer.subarray(position, position + contentLength);
// checking XRef stream suspect
// (it shall have '/XRef' and next char is not a letter)

View File

@ -64,6 +64,7 @@
!issue8798r.pdf
!issue8823.pdf
!issue9084.pdf
!issue9105_reduced.pdf
!bad-PageLabels.pdf
!filled-background.pdf
!ArabicCIDTrueType.pdf

View File

@ -0,0 +1,74 @@
%PDF-1.7
%âãÏÓ
1 0 obj
<<
/Title (Issue 9105)
/Author (Snuffleupagus)
>>
2 0 obj
<<
/Pages 3 0 R
/Type /Catalog
>>
endobj
3 0 obj
<<
/Kids [4 0 R]
/Count 1
/Type /Pages
>>
endobj
4 0 obj
<<
/Parent 3 0 R
/MediaBox [0 0 200 50]
/Resources
<<
/Font
<<
/F1 5 0 R
>>
>>
/Contents 6 0 R
/Type /Page
>>
endobj
5 0 obj
<<
/BaseFont /Times-Roman
/Subtype /Type1
/Encoding /WinAnsiEncoding
/Type /Font
>>
endobj
6 0 obj
<<
/Length 41
>>
stream
BT
10 20 TD
/F1 20 Tf
(Issue 9105) Tj
ET
endstream
endobj xref
0 7
0000000000 65535 f
0000000001 00000 n
0000000002 00000 n
0000000003 00000 n
0000000004 00000 n
0000000005 00000 n
0000000006 00000 n
trailer
<<
/Info 1 0 R
/Root 2 0 R
/Size 7
>>
startxref
491
%%EOF

View File

@ -741,6 +741,13 @@
"lastPage": 1,
"type": "eq"
},
{ "id": "issue9105",
"file": "pdfs/issue9105_reduced.pdf",
"md5": "f3889f7c7b60e1ab998aac430cc7e08e",
"rounds": 1,
"link": false,
"type": "eq"
},
{ "id": "issue6289",
"file": "pdfs/issue6289.pdf",
"md5": "0869f3d147c734ec484ffd492104095d",