diff --git a/src/core/obj.js b/src/core/obj.js index 6af60f1f3..c7058689e 100644 --- a/src/core/obj.js +++ b/src/core/obj.js @@ -1102,10 +1102,14 @@ var XRef = (function XRefClosure() { return skipped; } var objRegExp = /^(\d+)\s+(\d+)\s+obj\b/; + const endobjRegExp = /\bendobj[\b\s]$/; + const nestedObjRegExp = /\s+(\d+\s+\d+\s+obj[\b\s])$/; + const CHECK_CONTENT_LENGTH = 25; + var trailerBytes = new Uint8Array([116, 114, 97, 105, 108, 101, 114]); var startxrefBytes = new Uint8Array([115, 116, 97, 114, 116, 120, 114, 101, 102]); - var endobjBytes = new Uint8Array([101, 110, 100, 111, 98, 106]); + const objBytes = new Uint8Array([111, 98, 106]); var xrefBytes = new Uint8Array([47, 88, 82, 101, 102]); // Clear out any existing entries, since they may be bogus. @@ -1147,8 +1151,36 @@ var XRef = (function XRefClosure() { uncompressed: true, }; } - var contentLength = skipUntil(buffer, position, endobjBytes) + 7; - var content = buffer.subarray(position, position + contentLength); + let contentLength, startPos = position + token.length; + + // Find the next "obj" string, rather than "endobj", to ensure that + // we won't skip over a new 'obj' operator in corrupt files where + // 'endobj' operators are missing (fixes issue9105_reduced.pdf). + while (startPos < buffer.length) { + let endPos = startPos + skipUntil(buffer, startPos, objBytes) + 4; + contentLength = endPos - position; + + let checkPos = Math.max(endPos - CHECK_CONTENT_LENGTH, startPos); + let tokenStr = bytesToString(buffer.subarray(checkPos, endPos)); + + // Check if the current object ends with an 'endobj' operator. + if (endobjRegExp.test(tokenStr)) { + break; + } else { + // Check if an "obj" occurrence is actually a new object, + // i.e. the current object is missing the 'endobj' operator. 
+ let objToken = nestedObjRegExp.exec(tokenStr); + + if (objToken && objToken[1]) { + warn('indexObjects: Found new "obj" inside of another "obj", ' + + 'caused by missing "endobj" -- trying to recover.'); + contentLength -= objToken[1].length; + break; + } + } + startPos += contentLength; + } + let content = buffer.subarray(position, position + contentLength); // checking XRef stream suspect // (it shall have '/XRef' and next char is not a letter) diff --git a/test/pdfs/.gitignore b/test/pdfs/.gitignore index 6ec4eeb40..9b2378ecd 100644 --- a/test/pdfs/.gitignore +++ b/test/pdfs/.gitignore @@ -64,6 +64,7 @@ !issue8798r.pdf !issue8823.pdf !issue9084.pdf +!issue9105_reduced.pdf !bad-PageLabels.pdf !filled-background.pdf !ArabicCIDTrueType.pdf diff --git a/test/pdfs/issue9105_reduced.pdf b/test/pdfs/issue9105_reduced.pdf new file mode 100644 index 000000000..39412e285 --- /dev/null +++ b/test/pdfs/issue9105_reduced.pdf @@ -0,0 +1,74 @@ +%PDF-1.7 +%âãÏÓ +1 0 obj +<< +/Title (Issue 9105) +/Author (Snuffleupagus) +>> +2 0 obj +<< +/Pages 3 0 R +/Type /Catalog +>> +endobj +3 0 obj +<< +/Kids [4 0 R] +/Count 1 +/Type /Pages +>> +endobj +4 0 obj +<< +/Parent 3 0 R +/MediaBox [0 0 200 50] +/Resources +<< +/Font +<< +/F1 5 0 R +>> +>> +/Contents 6 0 R +/Type /Page +>> +endobj +5 0 obj +<< +/BaseFont /Times-Roman +/Subtype /Type1 +/Encoding /WinAnsiEncoding +/Type /Font +>> +endobj +6 0 obj +<< +/Length 41 +>> +stream +BT +10 20 TD +/F1 20 Tf +(Issue 9105) Tj +ET + +endstream +endobj xref +0 7 +0000000000 65535 f +0000000001 00000 n +0000000002 00000 n +0000000003 00000 n +0000000004 00000 n +0000000005 00000 n +0000000006 00000 n +trailer + +<< +/Info 1 0 R +/Root 2 0 R +/Size 7 +>> +startxref +491 +%%EOF diff --git a/test/test_manifest.json b/test/test_manifest.json index 8c8a85eea..f766ccd62 100644 --- a/test/test_manifest.json +++ b/test/test_manifest.json @@ -741,6 +741,13 @@ "lastPage": 1, "type": "eq" }, + { "id": "issue9105", + "file": "pdfs/issue9105_reduced.pdf", + 
"md5": "f3889f7c7b60e1ab998aac430cc7e08e", + "rounds": 1, + "link": false, + "type": "eq" + }, { "id": "issue6289", "file": "pdfs/issue6289.pdf", "md5": "0869f3d147c734ec484ffd492104095d",